From e513b6ffa5a2914f2882e0e82fe940d31f42e92e Mon Sep 17 00:00:00 2001 From: SmallCoccinelle <89733524+SmallCoccinelle@users.noreply.github.com> Date: Wed, 20 Oct 2021 07:12:24 +0200 Subject: [PATCH] Cache and reuse the scraper HTTP client (#1855) * Add Cookies directly to the request Rather than maintaining a cookie jar on a one-shot HTTP client, maintain the jar ourselves: make a new jar, then use it to select the right cookies. The cookies are set on the request rather than on the client. This will retain the current behavior as we are always throwing the client away after each use. This patch enables the lifting of the http client as well over time. * Introduce a cached scraper HTTP client The scraper cache is augmented with an *http.Client. These are safe for concurrent use, so the pointer can safely be passed around. Push this into scraper configurations where applicable, next to the txnManagers. When we issue a loadUrl request, do so on the cached *http.Client, which will reuse existing idle connections in the client if any are present. * Set MaxIdleConnsPerHost. Closes #1850 We allow for up to 8 idle connections to a single host. This should make concurrent operation toward the same host reuse connections, even for sizeable concurrency. The number isn't bumped excessively high. We should probably limit concurrency toward a single site anyway, since we'll be able to overrun a site with queries quite easily if we have many concurrent goroutines issuing requests at the same time. * Reinstate driverOptions / useCDP check Use DeMorgan's laws to invert the logic and exit early. Fixes tests breaking. * Documentation fixup. * Use the scraper http.Client when fetching images Fold image fetchers onto the cached scraper http.Client as well. This makes the scraper have a single http.Client cache for all its operations. 
Thread the client upwards to the relevant attachment points: either the cache, or a stash_box instance, which is extended to include a pointer to the client. Style roughly follows that of txnManagers. * Use the same http Client as the GraphQL client uses Rather than using http.DefaultClient, use the same client as the GraphQL client uses in the stash_box subsystem. This localizes the client used in the subsystem into the constructing New.. call. * Hoist HTTP client construction Create a function for initializing the HTTP Client we use. While here hoist magic numbers into constants. Introduce a proper static redirect error and use it in the client code as well. * Reinstate printCookies This is a debugging function, and it might still come in handy in the future at some point. * Nitpick comment. * Minor tidy Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com> --- pkg/scraper/action.go | 10 +++-- pkg/scraper/config_scraper.go | 32 +++++++++------- pkg/scraper/cookies.go | 61 +++++++++++++++++------------- pkg/scraper/freeones.go | 5 ++- pkg/scraper/image.go | 38 +++++++------------ pkg/scraper/json.go | 7 +++- pkg/scraper/scrapers.go | 63 ++++++++++++++++++++++++++----- pkg/scraper/stash.go | 11 ++++-- pkg/scraper/stashbox/stash_box.go | 27 ++++++------- pkg/scraper/url.go | 58 ++++++++++------------ pkg/scraper/xpath.go | 7 +++- pkg/scraper/xpath_test.go | 3 +- 12 files changed, 180 insertions(+), 142 deletions(-) diff --git a/pkg/scraper/action.go b/pkg/scraper/action.go index 838611252..d8c08da97 100644 --- a/pkg/scraper/action.go +++ b/pkg/scraper/action.go @@ -1,6 +1,8 @@ package scraper import ( + "net/http" + "github.com/stashapp/stash/pkg/models" ) @@ -38,16 +40,16 @@ type scraperActionImpl interface { scrapeMovieByURL(url string) (*models.ScrapedMovie, error) } -func (c config) getScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, globalConfig GlobalConfig) scraperActionImpl { +func (c config) 
getScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraperActionImpl { switch scraper.Action { case scraperActionScript: return newScriptScraper(scraper, c, globalConfig) case scraperActionStash: - return newStashScraper(scraper, txnManager, c, globalConfig) + return newStashScraper(scraper, client, txnManager, c, globalConfig) case scraperActionXPath: - return newXpathScraper(scraper, txnManager, c, globalConfig) + return newXpathScraper(scraper, client, txnManager, c, globalConfig) case scraperActionJson: - return newJsonScraper(scraper, txnManager, c, globalConfig) + return newJsonScraper(scraper, client, txnManager, c, globalConfig) } panic("unknown scraper action: " + scraper.Action) diff --git a/pkg/scraper/config_scraper.go b/pkg/scraper/config_scraper.go index b14d819e6..01ccd4f2b 100644 --- a/pkg/scraper/config_scraper.go +++ b/pkg/scraper/config_scraper.go @@ -1,6 +1,10 @@ package scraper -import "github.com/stashapp/stash/pkg/models" +import ( + "net/http" + + "github.com/stashapp/stash/pkg/models" +) type configSceneScraper struct { *configScraper @@ -12,7 +16,7 @@ func (c *configSceneScraper) matchesURL(url string) bool { func (c *configSceneScraper) scrapeByName(name string) ([]*models.ScrapedScene, error) { if c.config.SceneByName != nil { - s := c.config.getScraper(*c.config.SceneByName, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.SceneByName, c.client, c.txnManager, c.globalConfig) return s.scrapeScenesByName(name) } @@ -21,7 +25,7 @@ func (c *configSceneScraper) scrapeByName(name string) ([]*models.ScrapedScene, func (c *configSceneScraper) scrapeByScene(scene *models.Scene) (*models.ScrapedScene, error) { if c.config.SceneByFragment != nil { - s := c.config.getScraper(*c.config.SceneByFragment, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.SceneByFragment, c.client, c.txnManager, c.globalConfig) return s.scrapeSceneByScene(scene) } 
@@ -30,7 +34,7 @@ func (c *configSceneScraper) scrapeByScene(scene *models.Scene) (*models.Scraped func (c *configSceneScraper) scrapeByFragment(scene models.ScrapedSceneInput) (*models.ScrapedScene, error) { if c.config.SceneByQueryFragment != nil { - s := c.config.getScraper(*c.config.SceneByQueryFragment, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.SceneByQueryFragment, c.client, c.txnManager, c.globalConfig) return s.scrapeSceneByFragment(scene) } @@ -40,7 +44,7 @@ func (c *configSceneScraper) scrapeByFragment(scene models.ScrapedSceneInput) (* func (c *configSceneScraper) scrapeByURL(url string) (*models.ScrapedScene, error) { for _, scraper := range c.config.SceneByURL { if scraper.matchesURL(url) { - s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig) + s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig) ret, err := s.scrapeSceneByURL(url) if err != nil { return nil, err @@ -65,7 +69,7 @@ func (c *configPerformerScraper) matchesURL(url string) bool { func (c *configPerformerScraper) scrapeByName(name string) ([]*models.ScrapedPerformer, error) { if c.config.PerformerByName != nil { - s := c.config.getScraper(*c.config.PerformerByName, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.PerformerByName, c.client, c.txnManager, c.globalConfig) return s.scrapePerformersByName(name) } @@ -74,7 +78,7 @@ func (c *configPerformerScraper) scrapeByName(name string) ([]*models.ScrapedPer func (c *configPerformerScraper) scrapeByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) { if c.config.PerformerByFragment != nil { - s := c.config.getScraper(*c.config.PerformerByFragment, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.PerformerByFragment, c.client, c.txnManager, c.globalConfig) return s.scrapePerformerByFragment(scrapedPerformer) } @@ -89,7 +93,7 @@ func (c *configPerformerScraper) 
scrapeByFragment(scrapedPerformer models.Scrape func (c *configPerformerScraper) scrapeByURL(url string) (*models.ScrapedPerformer, error) { for _, scraper := range c.config.PerformerByURL { if scraper.matchesURL(url) { - s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig) + s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig) ret, err := s.scrapePerformerByURL(url) if err != nil { return nil, err @@ -114,7 +118,7 @@ func (c *configGalleryScraper) matchesURL(url string) bool { func (c *configGalleryScraper) scrapeByGallery(gallery *models.Gallery) (*models.ScrapedGallery, error) { if c.config.GalleryByFragment != nil { - s := c.config.getScraper(*c.config.GalleryByFragment, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.GalleryByFragment, c.client, c.txnManager, c.globalConfig) return s.scrapeGalleryByGallery(gallery) } @@ -124,7 +128,7 @@ func (c *configGalleryScraper) scrapeByGallery(gallery *models.Gallery) (*models func (c *configGalleryScraper) scrapeByFragment(gallery models.ScrapedGalleryInput) (*models.ScrapedGallery, error) { if c.config.GalleryByFragment != nil { // TODO - this should be galleryByQueryFragment - s := c.config.getScraper(*c.config.GalleryByFragment, c.txnManager, c.globalConfig) + s := c.config.getScraper(*c.config.GalleryByFragment, c.client, c.txnManager, c.globalConfig) return s.scrapeGalleryByFragment(gallery) } @@ -134,7 +138,7 @@ func (c *configGalleryScraper) scrapeByFragment(gallery models.ScrapedGalleryInp func (c *configGalleryScraper) scrapeByURL(url string) (*models.ScrapedGallery, error) { for _, scraper := range c.config.GalleryByURL { if scraper.matchesURL(url) { - s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig) + s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig) ret, err := s.scrapeGalleryByURL(url) if err != nil { return nil, err @@ -160,7 +164,7 @@ func (c 
*configMovieScraper) matchesURL(url string) bool { func (c *configMovieScraper) scrapeByURL(url string) (*models.ScrapedMovie, error) { for _, scraper := range c.config.MovieByURL { if scraper.matchesURL(url) { - s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig) + s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig) ret, err := s.scrapeMovieByURL(url) if err != nil { return nil, err @@ -177,12 +181,14 @@ func (c *configMovieScraper) scrapeByURL(url string) (*models.ScrapedMovie, erro type configScraper struct { config config + client *http.Client txnManager models.TransactionManager globalConfig GlobalConfig } -func createScraperFromConfig(c config, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper { +func createScraperFromConfig(c config, client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper { base := configScraper{ + client: client, config: c, txnManager: txnManager, globalConfig: globalConfig, diff --git a/pkg/scraper/cookies.go b/pkg/scraper/cookies.go index 824c64ee6..ed854d50a 100644 --- a/pkg/scraper/cookies.go +++ b/pkg/scraper/cookies.go @@ -11,42 +11,51 @@ import ( "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" + "golang.org/x/net/publicsuffix" "github.com/stashapp/stash/pkg/logger" "github.com/stashapp/stash/pkg/utils" ) -// set cookies for the native http client -func setCookies(jar *cookiejar.Jar, scraperConfig config) { - driverOptions := scraperConfig.DriverOptions - if driverOptions != nil && !driverOptions.UseCDP { +// jar constructs a cookie jar from a configuration +func (c config) jar() (*cookiejar.Jar, error) { + opts := c.DriverOptions + jar, err := cookiejar.New(&cookiejar.Options{ + PublicSuffixList: publicsuffix.List, + }) + if err != nil { + return nil, err + } - for _, ckURL := range driverOptions.Cookies { // go through all cookies - url, err := 
url.Parse(ckURL.CookieURL) // CookieURL must be valid, include schema - if err != nil { - logger.Warnf("Skipping jar cookies for cookieURL %s. Error %s", ckURL.CookieURL, err) - } else { - var httpCookies []*http.Cookie - var httpCookie *http.Cookie + if opts == nil || opts.UseCDP { + return jar, nil + } - for _, cookie := range ckURL.Cookies { - httpCookie = &http.Cookie{ - Name: cookie.Name, - Value: getCookieValue(cookie), - Path: cookie.Path, - Domain: cookie.Domain, - } + for i, ckURL := range opts.Cookies { + url, err := url.Parse(ckURL.CookieURL) // CookieURL must be valid, include schema + if err != nil { + logger.Warnf("skipping cookie [%d] for cookieURL %s: %v", i, ckURL.CookieURL, err) + continue + } - httpCookies = append(httpCookies, httpCookie) - } - jar.SetCookies(url, httpCookies) // jar.SetCookies only sets cookies with the domain matching the URL - - if jar.Cookies(url) == nil { - logger.Warnf("Setting jar cookies for %s failed", url.String()) - } + var httpCookies []*http.Cookie + for _, cookie := range ckURL.Cookies { + c := &http.Cookie{ + Name: cookie.Name, + Value: getCookieValue(cookie), + Path: cookie.Path, + Domain: cookie.Domain, } + httpCookies = append(httpCookies, c) + } + + jar.SetCookies(url, httpCookies) + if jar.Cookies(url) == nil { + logger.Warnf("setting jar cookies for %s failed", url.String()) } } + + return jar, nil } func getCookieValue(cookie *scraperCookies) string { @@ -56,7 +65,7 @@ func getCookieValue(cookie *scraperCookies) string { return cookie.Value } -// print all cookies from the jar of the native http client +// printCookies prints all cookies from the given cookie jar func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) { driverOptions := scraperConfig.DriverOptions if driverOptions != nil && !driverOptions.UseCDP { diff --git a/pkg/scraper/freeones.go b/pkg/scraper/freeones.go index d1193048f..c50235cc4 100644 --- a/pkg/scraper/freeones.go +++ b/pkg/scraper/freeones.go @@ -1,6 +1,7 @@ package 
scraper import ( + "net/http" "strings" "github.com/stashapp/stash/pkg/logger" @@ -123,7 +124,7 @@ xPathScrapers: # Last updated April 13, 2021 ` -func getFreeonesScraper(txnManager models.TransactionManager, globalConfig GlobalConfig) scraper { +func getFreeonesScraper(client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper { yml := freeonesScraperConfig c, err := loadConfigFromYAML(FreeonesScraperID, strings.NewReader(yml)) @@ -131,5 +132,5 @@ func getFreeonesScraper(txnManager models.TransactionManager, globalConfig Globa logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error()) } - return createScraperFromConfig(*c, txnManager, globalConfig) + return createScraperFromConfig(*c, client, txnManager, globalConfig) } diff --git a/pkg/scraper/image.go b/pkg/scraper/image.go index 02376df6f..3954cdbaf 100644 --- a/pkg/scraper/image.go +++ b/pkg/scraper/image.go @@ -2,28 +2,22 @@ package scraper import ( "context" - "crypto/tls" "fmt" "io" "net/http" "strings" - "time" "github.com/stashapp/stash/pkg/models" "github.com/stashapp/stash/pkg/utils" ) -// Timeout to get the image. Includes transfer time. May want to make this -// configurable at some point. 
-const imageGetTimeout = time.Second * 30 - -func setPerformerImage(ctx context.Context, p *models.ScrapedPerformer, globalConfig GlobalConfig) error { +func setPerformerImage(ctx context.Context, client *http.Client, p *models.ScrapedPerformer, globalConfig GlobalConfig) error { if p == nil || p.Image == nil || !strings.HasPrefix(*p.Image, "http") { // nothing to do return nil } - img, err := getImage(ctx, *p.Image, globalConfig) + img, err := getImage(ctx, *p.Image, client, globalConfig) if err != nil { return err } @@ -35,14 +29,14 @@ func setPerformerImage(ctx context.Context, p *models.ScrapedPerformer, globalCo return nil } -func setSceneImage(ctx context.Context, s *models.ScrapedScene, globalConfig GlobalConfig) error { +func setSceneImage(ctx context.Context, client *http.Client, s *models.ScrapedScene, globalConfig GlobalConfig) error { // don't try to get the image if it doesn't appear to be a URL if s == nil || s.Image == nil || !strings.HasPrefix(*s.Image, "http") { // nothing to do return nil } - img, err := getImage(ctx, *s.Image, globalConfig) + img, err := getImage(ctx, *s.Image, client, globalConfig) if err != nil { return err } @@ -52,14 +46,14 @@ func setSceneImage(ctx context.Context, s *models.ScrapedScene, globalConfig Glo return nil } -func setMovieFrontImage(ctx context.Context, m *models.ScrapedMovie, globalConfig GlobalConfig) error { +func setMovieFrontImage(ctx context.Context, client *http.Client, m *models.ScrapedMovie, globalConfig GlobalConfig) error { // don't try to get the image if it doesn't appear to be a URL if m == nil || m.FrontImage == nil || !strings.HasPrefix(*m.FrontImage, "http") { // nothing to do return nil } - img, err := getImage(ctx, *m.FrontImage, globalConfig) + img, err := getImage(ctx, *m.FrontImage, client, globalConfig) if err != nil { return err } @@ -69,14 +63,14 @@ func setMovieFrontImage(ctx context.Context, m *models.ScrapedMovie, globalConfi return nil } -func setMovieBackImage(ctx context.Context, m 
*models.ScrapedMovie, globalConfig GlobalConfig) error { +func setMovieBackImage(ctx context.Context, client *http.Client, m *models.ScrapedMovie, globalConfig GlobalConfig) error { // don't try to get the image if it doesn't appear to be a URL if m == nil || m.BackImage == nil || !strings.HasPrefix(*m.BackImage, "http") { // nothing to do return nil } - img, err := getImage(ctx, *m.BackImage, globalConfig) + img, err := getImage(ctx, *m.BackImage, client, globalConfig) if err != nil { return err } @@ -86,13 +80,7 @@ func setMovieBackImage(ctx context.Context, m *models.ScrapedMovie, globalConfig return nil } -func getImage(ctx context.Context, url string, globalConfig GlobalConfig) (*string, error) { - client := &http.Client{ - Transport: &http.Transport{ // ignore insecure certificates - TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()}}, - Timeout: imageGetTimeout, - } - +func getImage(ctx context.Context, url string, client *http.Client, globalConfig GlobalConfig) (*string, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return nil, err @@ -137,10 +125,10 @@ func getImage(ctx context.Context, url string, globalConfig GlobalConfig) (*stri return &img, nil } -func getStashPerformerImage(ctx context.Context, stashURL string, performerID string, globalConfig GlobalConfig) (*string, error) { - return getImage(ctx, stashURL+"/performer/"+performerID+"/image", globalConfig) +func getStashPerformerImage(ctx context.Context, stashURL string, performerID string, client *http.Client, globalConfig GlobalConfig) (*string, error) { + return getImage(ctx, stashURL+"/performer/"+performerID+"/image", client, globalConfig) } -func getStashSceneImage(ctx context.Context, stashURL string, sceneID string, globalConfig GlobalConfig) (*string, error) { - return getImage(ctx, stashURL+"/scene/"+sceneID+"/screenshot", globalConfig) +func getStashSceneImage(ctx context.Context, stashURL string, sceneID 
string, client *http.Client, globalConfig GlobalConfig) (*string, error) { + return getImage(ctx, stashURL+"/scene/"+sceneID+"/screenshot", client, globalConfig) } diff --git a/pkg/scraper/json.go b/pkg/scraper/json.go index 1543e18ca..82bf1aa0b 100644 --- a/pkg/scraper/json.go +++ b/pkg/scraper/json.go @@ -4,6 +4,7 @@ import ( "context" "errors" "io" + "net/http" "net/url" "strings" @@ -16,13 +17,15 @@ type jsonScraper struct { scraper scraperTypeConfig config config globalConfig GlobalConfig + client *http.Client txnManager models.TransactionManager } -func newJsonScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *jsonScraper { +func newJsonScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *jsonScraper { return &jsonScraper{ scraper: scraper, config: config, + client: client, globalConfig: globalConfig, txnManager: txnManager, } @@ -49,7 +52,7 @@ func (s *jsonScraper) scrapeURL(ctx context.Context, url string) (string, *mappe } func (s *jsonScraper) loadURL(ctx context.Context, url string) (string, error) { - r, err := loadURL(ctx, url, s.config, s.globalConfig) + r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig) if err != nil { return "", err } diff --git a/pkg/scraper/scrapers.go b/pkg/scraper/scrapers.go index 82795bb0e..6c3d16d9c 100644 --- a/pkg/scraper/scrapers.go +++ b/pkg/scraper/scrapers.go @@ -2,11 +2,15 @@ package scraper import ( "context" + "crypto/tls" "errors" + "fmt" + "net/http" "os" "path/filepath" "regexp" "strings" + "time" "github.com/stashapp/stash/pkg/logger" stash_config "github.com/stashapp/stash/pkg/manager/config" @@ -15,6 +19,22 @@ import ( "github.com/stashapp/stash/pkg/utils" ) +var ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached") + +const ( + // scrapeGetTimeout is the timeout for scraper HTTP requests. Includes transfer time. 
+ // We may want to bump this at some point and use local context-timeouts if more granularity + // is needed. + scrapeGetTimeout = time.Second * 60 + + // maxIdleConnsPerHost is the maximum number of idle connections the HTTP client will + // keep on a per-host basis. + maxIdleConnsPerHost = 8 + + // maxRedirects defines the maximum number of redirects the HTTP client will follow + maxRedirects = 20 +) + // GlobalConfig contains the global scraper options. type GlobalConfig interface { GetScraperUserAgent() string @@ -33,11 +53,32 @@ func isCDPPathWS(c GlobalConfig) bool { // Cache stores scraper details. type Cache struct { + client *http.Client scrapers []scraper globalConfig GlobalConfig txnManager models.TransactionManager } +// newClient creates a scraper-local http client we use throughout the scraper subsystem. +func newClient(gc GlobalConfig) *http.Client { + client := &http.Client{ + Transport: &http.Transport{ // ignore insecure certificates + TLSClientConfig: &tls.Config{InsecureSkipVerify: !gc.GetScraperCertCheck()}, + MaxIdleConnsPerHost: maxIdleConnsPerHost, + }, + Timeout: scrapeGetTimeout, + // defaultCheckRedirect code with max changed from 10 to maxRedirects + CheckRedirect: func(req *http.Request, via []*http.Request) error { + if len(via) >= maxRedirects { + return fmt.Errorf("after %d redirects: %w", maxRedirects, ErrMaxRedirects) + } + return nil + }, + } + + return client +} + // NewCache returns a new Cache loading scraper configurations from the // scraper path provided in the global config object. It returns a new // instance and an error if the scraper directory could not be loaded. @@ -45,19 +86,23 @@ type Cache struct { // Scraper configurations are loaded from yml files in the provided scrapers // directory and any subdirectories. 
func NewCache(globalConfig GlobalConfig, txnManager models.TransactionManager) (*Cache, error) { - scrapers, err := loadScrapers(globalConfig, txnManager) + // HTTP Client setup + client := newClient(globalConfig) + + scrapers, err := loadScrapers(globalConfig, client, txnManager) if err != nil { return nil, err } return &Cache{ + client: client, globalConfig: globalConfig, scrapers: scrapers, txnManager: txnManager, }, nil } -func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManager) ([]scraper, error) { +func loadScrapers(globalConfig GlobalConfig, client *http.Client, txnManager models.TransactionManager) ([]scraper, error) { path := globalConfig.GetScrapersPath() scrapers := make([]scraper, 0) @@ -76,14 +121,14 @@ func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManage } // add built-in freeones scraper - scrapers = append(scrapers, getFreeonesScraper(txnManager, globalConfig), getAutoTagScraper(txnManager, globalConfig)) + scrapers = append(scrapers, getFreeonesScraper(client, txnManager, globalConfig), getAutoTagScraper(txnManager, globalConfig)) for _, file := range scraperFiles { c, err := loadConfigFromYAMLFile(file) if err != nil { logger.Errorf("Error loading scraper %s: %s", file, err.Error()) } else { - scraper := createScraperFromConfig(*c, txnManager, globalConfig) + scraper := createScraperFromConfig(*c, client, txnManager, globalConfig) scrapers = append(scrapers, scraper) } } @@ -95,7 +140,7 @@ func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManage // In the event of an error during loading, the cache will be left empty. 
func (c *Cache) ReloadScrapers() error { c.scrapers = nil - scrapers, err := loadScrapers(c.globalConfig, c.txnManager) + scrapers, err := loadScrapers(c.globalConfig, c.client, c.txnManager) if err != nil { return err } @@ -255,7 +300,7 @@ func (c Cache) postScrapePerformer(ctx context.Context, ret *models.ScrapedPerfo } // post-process - set the image if applicable - if err := setPerformerImage(ctx, ret, c.globalConfig); err != nil { + if err := setPerformerImage(ctx, c.client, ret, c.globalConfig); err != nil { logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error()) } @@ -323,7 +368,7 @@ func (c Cache) postScrapeScene(ctx context.Context, ret *models.ScrapedScene) er } // post-process - set the image if applicable - if err := setSceneImage(ctx, ret, c.globalConfig); err != nil { + if err := setSceneImage(ctx, c.client, ret, c.globalConfig); err != nil { logger.Warnf("Could not set image using URL %s: %v", *ret.Image, err) } @@ -551,10 +596,10 @@ func (c Cache) ScrapeMovieURL(url string) (*models.ScrapedMovie, error) { } // post-process - set the image if applicable - if err := setMovieFrontImage(context.TODO(), ret, c.globalConfig); err != nil { + if err := setMovieFrontImage(context.TODO(), c.client, ret, c.globalConfig); err != nil { logger.Warnf("Could not set front image using URL %s: %s", *ret.FrontImage, err.Error()) } - if err := setMovieBackImage(context.TODO(), ret, c.globalConfig); err != nil { + if err := setMovieBackImage(context.TODO(), c.client, ret, c.globalConfig); err != nil { logger.Warnf("Could not set back image using URL %s: %s", *ret.BackImage, err.Error()) } diff --git a/pkg/scraper/stash.go b/pkg/scraper/stash.go index df83d5fa4..50f5cc12a 100644 --- a/pkg/scraper/stash.go +++ b/pkg/scraper/stash.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "errors" + "net/http" "strconv" "github.com/jinzhu/copier" @@ -16,13 +17,15 @@ type stashScraper struct { scraper scraperTypeConfig config config globalConfig GlobalConfig 
+ client *http.Client txnManager models.TransactionManager } -func newStashScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *stashScraper { +func newStashScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *stashScraper { return &stashScraper{ scraper: scraper, config: config, + client: client, globalConfig: globalConfig, txnManager: txnManager, } @@ -138,7 +141,7 @@ func (s *stashScraper) scrapePerformerByFragment(scrapedPerformer models.Scraped } // get the performer image directly - ret.Image, err = getStashPerformerImage(context.TODO(), s.config.StashServer.URL, performerID, s.globalConfig) + ret.Image, err = getStashPerformerImage(context.TODO(), s.config.StashServer.URL, performerID, s.client, s.globalConfig) if err != nil { return nil, err } @@ -164,7 +167,7 @@ func (s *stashScraper) scrapedStashSceneToScrapedScene(scene *scrapedSceneStash) } // get the performer image directly - ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, scene.ID, s.globalConfig) + ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, scene.ID, s.client, s.globalConfig) if err != nil { return nil, err } @@ -251,7 +254,7 @@ func (s *stashScraper) scrapeSceneByScene(scene *models.Scene) (*models.ScrapedS } // get the performer image directly - ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, q.FindScene.ID, s.globalConfig) + ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, q.FindScene.ID, s.client, s.globalConfig) if err != nil { return nil, err } diff --git a/pkg/scraper/stashbox/stash_box.go b/pkg/scraper/stashbox/stash_box.go index 71dc1b109..7556b9426 100644 --- a/pkg/scraper/stashbox/stash_box.go +++ b/pkg/scraper/stashbox/stash_box.go @@ -7,7 +7,6 @@ import ( "net/http" "strconv" "strings" - "time" 
"github.com/Yamashou/gqlgenc/client" @@ -18,10 +17,6 @@ import ( "github.com/stashapp/stash/pkg/utils" ) -// Timeout to get the image. Includes transfer time. May want to make this -// configurable at some point. -const imageGetTimeout = time.Second * 30 - // Client represents the client interface to a stash-box server instance. type Client struct { client *graphql.Client @@ -44,6 +39,10 @@ func NewClient(box models.StashBox, txnManager models.TransactionManager) *Clien } } +func (c Client) getHTTPClient() *http.Client { + return c.client.Client.Client +} + // QueryStashBoxScene queries stash-box for scenes using a query string. func (c Client) QueryStashBoxScene(ctx context.Context, queryStr string) ([]*models.ScrapedScene, error) { scenes, err := c.client.SearchScene(ctx, queryStr) @@ -55,7 +54,7 @@ func (c Client) QueryStashBoxScene(ctx context.Context, queryStr string) ([]*mod var ret []*models.ScrapedScene for _, s := range sceneFragments { - ss, err := sceneFragmentToScrapedScene(context.TODO(), c.txnManager, s) + ss, err := sceneFragmentToScrapedScene(context.TODO(), c.getHTTPClient(), c.txnManager, s) if err != nil { return nil, err } @@ -201,7 +200,7 @@ func (c Client) findStashBoxScenesByFingerprints(ctx context.Context, fingerprin sceneFragments := scenes.FindScenesByFingerprints for _, s := range sceneFragments { - ss, err := sceneFragmentToScrapedScene(ctx, c.txnManager, s) + ss, err := sceneFragmentToScrapedScene(ctx, c.getHTTPClient(), c.txnManager, s) if err != nil { return nil, err } @@ -509,11 +508,7 @@ func formatBodyModifications(m []*graphql.BodyModificationFragment) *string { return &ret } -func fetchImage(ctx context.Context, url string) (*string, error) { - client := &http.Client{ - Timeout: imageGetTimeout, - } - +func fetchImage(ctx context.Context, client *http.Client, url string) (*string, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return nil, err @@ -595,8 +590,8 @@ func 
performerFragmentToScrapedScenePerformer(p graphql.PerformerFragment) *mode return sp } -func getFirstImage(ctx context.Context, images []*graphql.ImageFragment) *string { - ret, err := fetchImage(ctx, images[0].URL) +func getFirstImage(ctx context.Context, client *http.Client, images []*graphql.ImageFragment) *string { + ret, err := fetchImage(ctx, client, images[0].URL) if err != nil { logger.Warnf("Error fetching image %s: %s", images[0].URL, err.Error()) } @@ -617,7 +612,7 @@ func getFingerprints(scene *graphql.SceneFragment) []*models.StashBoxFingerprint return fingerprints } -func sceneFragmentToScrapedScene(ctx context.Context, txnManager models.TransactionManager, s *graphql.SceneFragment) (*models.ScrapedScene, error) { +func sceneFragmentToScrapedScene(ctx context.Context, client *http.Client, txnManager models.TransactionManager, s *graphql.SceneFragment) (*models.ScrapedScene, error) { stashID := s.ID ss := &models.ScrapedScene{ Title: s.Title, @@ -634,7 +629,7 @@ func sceneFragmentToScrapedScene(ctx context.Context, txnManager models.Transact if len(s.Images) > 0 { // TODO - #454 code sorts images by aspect ratio according to a wanted // orientation. I'm just grabbing the first for now - ss.Image = getFirstImage(ctx, s.Images) + ss.Image = getFirstImage(ctx, client, s.Images) } if err := txnManager.WithReadTxn(ctx, func(r models.ReaderRepository) error { diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go index 516c26387..b07722d3f 100644 --- a/pkg/scraper/url.go +++ b/pkg/scraper/url.go @@ -3,12 +3,10 @@ package scraper import ( "bytes" "context" - "crypto/tls" - "errors" "fmt" "io" "net/http" - "net/http/cookiejar" + "net/url" "os" "strings" "time" @@ -18,55 +16,40 @@ import ( "github.com/chromedp/chromedp" jsoniter "github.com/json-iterator/go" "golang.org/x/net/html/charset" - "golang.org/x/net/publicsuffix" "github.com/stashapp/stash/pkg/logger" ) -// Timeout for the scrape http request. Includes transfer time. 
May want to make this -// configurable at some point. -const scrapeGetTimeout = time.Second * 60 const scrapeDefaultSleep = time.Second * 2 -func loadURL(ctx context.Context, url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) { +func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) { driverOptions := scraperConfig.DriverOptions if driverOptions != nil && driverOptions.UseCDP { // get the page using chrome dp - return urlFromCDP(ctx, url, *driverOptions, globalConfig) + return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig) } - // get the page using http.Client - options := cookiejar.Options{ - PublicSuffixList: publicsuffix.List, - } - jar, er := cookiejar.New(&options) - if er != nil { - return nil, er - } - - setCookies(jar, scraperConfig) - printCookies(jar, scraperConfig, "Jar cookies set from scraper") - - client := &http.Client{ - Transport: &http.Transport{ // ignore insecure certificates - TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()}, - }, - Timeout: scrapeGetTimeout, - // defaultCheckRedirect code with max changed from 10 to 20 - CheckRedirect: func(req *http.Request, via []*http.Request) error { - if len(via) >= 20 { - return errors.New("stopped after 20 redirects") - } - return nil - }, - Jar: jar, - } - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil) if err != nil { return nil, err } + jar, err := scraperConfig.jar() + if err != nil { + return nil, fmt.Errorf("error creating cookie jar: %w", err) + } + + u, err := url.Parse(loadURL) + if err != nil { + return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err) + } + + // Fetch relevant cookies from the jar for url u and add them to the request + cookies := jar.Cookies(u) + for _, cookie := range cookies { + req.AddCookie(cookie) + } + 
userAgent := globalConfig.GetScraperUserAgent() if userAgent != "" { req.Header.Set("User-Agent", userAgent) @@ -98,7 +81,6 @@ func loadURL(ctx context.Context, url string, scraperConfig config, globalConfig bodyReader := bytes.NewReader(body) printCookies(jar, scraperConfig, "Jar cookies found for scraper urls") - return charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) } diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go index 5093bd099..0f820a4cd 100644 --- a/pkg/scraper/xpath.go +++ b/pkg/scraper/xpath.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "errors" + "net/http" "net/url" "regexp" "strings" @@ -20,14 +21,16 @@ type xpathScraper struct { scraper scraperTypeConfig config config globalConfig GlobalConfig + client *http.Client txnManager models.TransactionManager } -func newXpathScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper { +func newXpathScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper { return &xpathScraper{ scraper: scraper, config: config, globalConfig: globalConfig, + client: client, txnManager: txnManager, } } @@ -227,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByFragment(gallery models.ScrapedGalleryInpu } func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, error) { - r, err := loadURL(ctx, url, s.config, s.globalConfig) + r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig) if err != nil { return nil, err } diff --git a/pkg/scraper/xpath_test.go b/pkg/scraper/xpath_test.go index fbffcc390..ff01741b7 100644 --- a/pkg/scraper/xpath_test.go +++ b/pkg/scraper/xpath_test.go @@ -874,7 +874,8 @@ xPathScrapers: globalConfig := mockGlobalConfig{} - s := createScraperFromConfig(*c, nil, globalConfig) + client := &http.Client{} + s := createScraperFromConfig(*c, client, nil, globalConfig) performer, err := 
s.Performer.scrapeByURL(ts.URL) if err != nil {