diff --git a/pkg/scraper/action.go b/pkg/scraper/action.go
index 838611252..d8c08da97 100644
--- a/pkg/scraper/action.go
+++ b/pkg/scraper/action.go
@@ -1,6 +1,8 @@
 package scraper
 
 import (
+	"net/http"
+
 	"github.com/stashapp/stash/pkg/models"
 )
 
@@ -38,16 +40,16 @@ type scraperActionImpl interface {
 	scrapeMovieByURL(url string) (*models.ScrapedMovie, error)
 }
 
-func (c config) getScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, globalConfig GlobalConfig) scraperActionImpl {
+func (c config) getScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraperActionImpl {
 	switch scraper.Action {
 	case scraperActionScript:
 		return newScriptScraper(scraper, c, globalConfig)
 	case scraperActionStash:
-		return newStashScraper(scraper, txnManager, c, globalConfig)
+		return newStashScraper(scraper, client, txnManager, c, globalConfig)
 	case scraperActionXPath:
-		return newXpathScraper(scraper, txnManager, c, globalConfig)
+		return newXpathScraper(scraper, client, txnManager, c, globalConfig)
 	case scraperActionJson:
-		return newJsonScraper(scraper, txnManager, c, globalConfig)
+		return newJsonScraper(scraper, client, txnManager, c, globalConfig)
 	}
 
 	panic("unknown scraper action: " + scraper.Action)
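The getScraper change above is the heart of the patch: instead of each scraper action building its own HTTP client, a single shared *http.Client is injected into every constructor. A minimal sketch of that dependency-injection pattern, with illustrative names (fetcher, newFetcher) that are not part of the patch:

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// fetcher stands in for a scraper action: it holds, but never creates,
// the HTTP client it uses.
type fetcher struct {
	client *http.Client
}

func newFetcher(client *http.Client) *fetcher {
	return &fetcher{client: client}
}

func main() {
	// Construct the client once; every fetcher shares its connection
	// pool, timeout, and TLS settings.
	shared := &http.Client{Timeout: 60 * time.Second}

	a := newFetcher(shared)
	b := newFetcher(shared)
	fmt.Println(a.client == b.client) // true: same pool, same settings
}
```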
diff --git a/pkg/scraper/config_scraper.go b/pkg/scraper/config_scraper.go
index b14d819e6..01ccd4f2b 100644
--- a/pkg/scraper/config_scraper.go
+++ b/pkg/scraper/config_scraper.go
@@ -1,6 +1,10 @@
 package scraper
 
-import "github.com/stashapp/stash/pkg/models"
+import (
+	"net/http"
+
+	"github.com/stashapp/stash/pkg/models"
+)
 
 type configSceneScraper struct {
 	*configScraper
@@ -12,7 +16,7 @@ func (c *configSceneScraper) matchesURL(url string) bool {
 
 func (c *configSceneScraper) scrapeByName(name string) ([]*models.ScrapedScene, error) {
 	if c.config.SceneByName != nil {
-		s := c.config.getScraper(*c.config.SceneByName, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.SceneByName, c.client, c.txnManager, c.globalConfig)
 		return s.scrapeScenesByName(name)
 	}
 
@@ -21,7 +25,7 @@ func (c *configSceneScraper) scrapeByName(name string) ([]*models.ScrapedScene,
 
 func (c *configSceneScraper) scrapeByScene(scene *models.Scene) (*models.ScrapedScene, error) {
 	if c.config.SceneByFragment != nil {
-		s := c.config.getScraper(*c.config.SceneByFragment, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.SceneByFragment, c.client, c.txnManager, c.globalConfig)
 		return s.scrapeSceneByScene(scene)
 	}
 
@@ -30,7 +34,7 @@ func (c *configSceneScraper) scrapeByScene(scene *models.Scene) (*models.Scraped
 
 func (c *configSceneScraper) scrapeByFragment(scene models.ScrapedSceneInput) (*models.ScrapedScene, error) {
 	if c.config.SceneByQueryFragment != nil {
-		s := c.config.getScraper(*c.config.SceneByQueryFragment, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.SceneByQueryFragment, c.client, c.txnManager, c.globalConfig)
 		return s.scrapeSceneByFragment(scene)
 	}
 
@@ -40,7 +44,7 @@ func (c *configSceneScraper) scrapeByFragment(scene models.ScrapedSceneInput) (*
 func (c *configSceneScraper) scrapeByURL(url string) (*models.ScrapedScene, error) {
 	for _, scraper := range c.config.SceneByURL {
 		if scraper.matchesURL(url) {
-			s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig)
+			s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig)
 			ret, err := s.scrapeSceneByURL(url)
 			if err != nil {
 				return nil, err
@@ -65,7 +69,7 @@ func (c *configPerformerScraper) matchesURL(url string) bool {
 
 func (c *configPerformerScraper) scrapeByName(name string) ([]*models.ScrapedPerformer, error) {
 	if c.config.PerformerByName != nil {
-		s := c.config.getScraper(*c.config.PerformerByName, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.PerformerByName, c.client, c.txnManager, c.globalConfig)
 		return s.scrapePerformersByName(name)
 	}
 
@@ -74,7 +78,7 @@ func (c *configPerformerScraper) scrapeByName(name string) ([]*models.ScrapedPer
 
 func (c *configPerformerScraper) scrapeByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
 	if c.config.PerformerByFragment != nil {
-		s := c.config.getScraper(*c.config.PerformerByFragment, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.PerformerByFragment, c.client, c.txnManager, c.globalConfig)
 		return s.scrapePerformerByFragment(scrapedPerformer)
 	}
 
@@ -89,7 +93,7 @@ func (c *configPerformerScraper) scrapeByFragment(scrapedPerformer models.Scrape
 func (c *configPerformerScraper) scrapeByURL(url string) (*models.ScrapedPerformer, error) {
 	for _, scraper := range c.config.PerformerByURL {
 		if scraper.matchesURL(url) {
-			s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig)
+			s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig)
 			ret, err := s.scrapePerformerByURL(url)
 			if err != nil {
 				return nil, err
@@ -114,7 +118,7 @@ func (c *configGalleryScraper) matchesURL(url string) bool {
 
 func (c *configGalleryScraper) scrapeByGallery(gallery *models.Gallery) (*models.ScrapedGallery, error) {
 	if c.config.GalleryByFragment != nil {
-		s := c.config.getScraper(*c.config.GalleryByFragment, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.GalleryByFragment, c.client, c.txnManager, c.globalConfig)
 		return s.scrapeGalleryByGallery(gallery)
 	}
 
@@ -124,7 +128,7 @@ func (c *configGalleryScraper) scrapeByGallery(gallery *models.Gallery) (*models
 
 func (c *configGalleryScraper) scrapeByFragment(gallery models.ScrapedGalleryInput) (*models.ScrapedGallery, error) {
 	if c.config.GalleryByFragment != nil {
 		// TODO - this should be galleryByQueryFragment
-		s := c.config.getScraper(*c.config.GalleryByFragment, c.txnManager, c.globalConfig)
+		s := c.config.getScraper(*c.config.GalleryByFragment, c.client, c.txnManager, c.globalConfig)
 		return s.scrapeGalleryByFragment(gallery)
 	}
 
@@ -134,7 +138,7 @@ func (c *configGalleryScraper) scrapeByFragment(gallery models.ScrapedGalleryInp
 func (c *configGalleryScraper) scrapeByURL(url string) (*models.ScrapedGallery, error) {
 	for _, scraper := range c.config.GalleryByURL {
 		if scraper.matchesURL(url) {
-			s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig)
+			s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig)
 			ret, err := s.scrapeGalleryByURL(url)
 			if err != nil {
 				return nil, err
@@ -160,7 +164,7 @@ func (c *configMovieScraper) matchesURL(url string) bool {
 
 func (c *configMovieScraper) scrapeByURL(url string) (*models.ScrapedMovie, error) {
 	for _, scraper := range c.config.MovieByURL {
 		if scraper.matchesURL(url) {
-			s := c.config.getScraper(scraper.scraperTypeConfig, c.txnManager, c.globalConfig)
+			s := c.config.getScraper(scraper.scraperTypeConfig, c.client, c.txnManager, c.globalConfig)
 			ret, err := s.scrapeMovieByURL(url)
 			if err != nil {
 				return nil, err
@@ -177,12 +181,14 @@ func (c *configMovieScraper) scrapeByURL(url string) (*models.ScrapedMovie, erro
 
 type configScraper struct {
 	config       config
+	client       *http.Client
 	txnManager   models.TransactionManager
 	globalConfig GlobalConfig
 }
 
-func createScraperFromConfig(c config, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper {
+func createScraperFromConfig(c config, client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper {
 	base := configScraper{
+		client:       client,
 		config:       c,
 		txnManager:   txnManager,
 		globalConfig: globalConfig,
diff --git a/pkg/scraper/cookies.go b/pkg/scraper/cookies.go
index 824c64ee6..ed854d50a 100644
--- a/pkg/scraper/cookies.go
+++ b/pkg/scraper/cookies.go
@@ -11,42 +11,51 @@ import (
 	"github.com/chromedp/cdproto/cdp"
 	"github.com/chromedp/cdproto/network"
 	"github.com/chromedp/chromedp"
+	"golang.org/x/net/publicsuffix"
 
 	"github.com/stashapp/stash/pkg/logger"
 	"github.com/stashapp/stash/pkg/utils"
 )
 
-// set cookies for the native http client
-func setCookies(jar *cookiejar.Jar, scraperConfig config) {
-	driverOptions := scraperConfig.DriverOptions
-	if driverOptions != nil && !driverOptions.UseCDP {
+// jar constructs a cookie jar from a configuration
+func (c config) jar() (*cookiejar.Jar, error) {
+	opts := c.DriverOptions
+	jar, err := cookiejar.New(&cookiejar.Options{
+		PublicSuffixList: publicsuffix.List,
+	})
+	if err != nil {
+		return nil, err
+	}
 
-		for _, ckURL := range driverOptions.Cookies { // go through all cookies
-			url, err := url.Parse(ckURL.CookieURL) // CookieURL must be valid, include schema
-			if err != nil {
-				logger.Warnf("Skipping jar cookies for cookieURL %s. Error %s", ckURL.CookieURL, err)
-			} else {
-				var httpCookies []*http.Cookie
-				var httpCookie *http.Cookie
+	if opts == nil || opts.UseCDP {
+		return jar, nil
+	}
 
-				for _, cookie := range ckURL.Cookies {
-					httpCookie = &http.Cookie{
-						Name:   cookie.Name,
-						Value:  getCookieValue(cookie),
-						Path:   cookie.Path,
-						Domain: cookie.Domain,
-					}
+	for i, ckURL := range opts.Cookies {
+		url, err := url.Parse(ckURL.CookieURL) // CookieURL must be valid, include scheme
+		if err != nil {
+			logger.Warnf("skipping cookie [%d] for cookieURL %s: %v", i, ckURL.CookieURL, err)
+			continue
+		}
 
-					httpCookies = append(httpCookies, httpCookie)
-				}
-				jar.SetCookies(url, httpCookies) // jar.SetCookies only sets cookies with the domain matching the URL
-
-				if jar.Cookies(url) == nil {
-					logger.Warnf("Setting jar cookies for %s failed", url.String())
-				}
+		var httpCookies []*http.Cookie
+		for _, cookie := range ckURL.Cookies {
+			c := &http.Cookie{
+				Name:   cookie.Name,
+				Value:  getCookieValue(cookie),
+				Path:   cookie.Path,
+				Domain: cookie.Domain,
 			}
+			httpCookies = append(httpCookies, c)
 		}
+
+		jar.SetCookies(url, httpCookies)
+		if jar.Cookies(url) == nil {
+			logger.Warnf("setting jar cookies for %s failed", url.String())
+		}
 	}
+
+	return jar, nil
 }
 
 func getCookieValue(cookie *scraperCookies) string {
@@ -56,7 +65,7 @@ func getCookieValue(cookie *scraperCookies) string {
 	return cookie.Value
 }
 
-// print all cookies from the jar of the native http client
+// printCookies prints all cookies from the given cookie jar
 func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) {
 	driverOptions := scraperConfig.DriverOptions
 	if driverOptions != nil && !driverOptions.UseCDP {
diff --git a/pkg/scraper/freeones.go b/pkg/scraper/freeones.go
index d1193048f..c50235cc4 100644
--- a/pkg/scraper/freeones.go
+++ b/pkg/scraper/freeones.go
@@ -1,6 +1,7 @@
 package scraper
 
 import (
+	"net/http"
 	"strings"
 
 	"github.com/stashapp/stash/pkg/logger"
@@ -123,7 +124,7 @@ xPathScrapers:
 # Last updated April 13, 2021
 `
 
-func getFreeonesScraper(txnManager models.TransactionManager, globalConfig GlobalConfig) scraper {
+func getFreeonesScraper(client *http.Client, txnManager models.TransactionManager, globalConfig GlobalConfig) scraper {
 	yml := freeonesScraperConfig
 
 	c, err := loadConfigFromYAML(FreeonesScraperID, strings.NewReader(yml))
@@ -131,5 +132,5 @@ func getFreeonesScraper(txnManager models.TransactionManager, globalConfig Globa
 		logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error())
 	}
 
-	return createScraperFromConfig(*c, txnManager, globalConfig)
+	return createScraperFromConfig(*c, client, txnManager, globalConfig)
 }
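With setCookies replaced by config.jar() in the cookies.go hunk above, jar construction and seeding now live behind one constructor whose failure callers can check. A short sketch of the stdlib mechanics jar() builds on; the URL and cookie values here are made up:

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"

	"golang.org/x/net/publicsuffix"
)

func main() {
	// Same construction as config.jar(): a jar backed by the public
	// suffix list, so cookies cannot be set for TLDs like ".com".
	jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
	if err != nil {
		panic(err)
	}

	// The cookie URL must include a scheme, as the patch comment notes.
	u, _ := url.Parse("https://example.com/")
	jar.SetCookies(u, []*http.Cookie{{Name: "session", Value: "abc123"}})

	// Cookies returns only cookies whose domain matches the given URL,
	// which is why the patch warns when the result comes back empty.
	for _, ck := range jar.Cookies(u) {
		fmt.Printf("%s=%s\n", ck.Name, ck.Value)
	}
}
```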
diff --git a/pkg/scraper/image.go b/pkg/scraper/image.go
index 02376df6f..3954cdbaf 100644
--- a/pkg/scraper/image.go
+++ b/pkg/scraper/image.go
@@ -2,28 +2,22 @@ package scraper
 
 import (
 	"context"
-	"crypto/tls"
 	"fmt"
 	"io"
 	"net/http"
 	"strings"
-	"time"
 
 	"github.com/stashapp/stash/pkg/models"
 	"github.com/stashapp/stash/pkg/utils"
 )
 
-// Timeout to get the image. Includes transfer time. May want to make this
-// configurable at some point.
-const imageGetTimeout = time.Second * 30
-
-func setPerformerImage(ctx context.Context, p *models.ScrapedPerformer, globalConfig GlobalConfig) error {
+func setPerformerImage(ctx context.Context, client *http.Client, p *models.ScrapedPerformer, globalConfig GlobalConfig) error {
 	if p == nil || p.Image == nil || !strings.HasPrefix(*p.Image, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(ctx, *p.Image, globalConfig)
+	img, err := getImage(ctx, *p.Image, client, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -35,14 +29,14 @@ func setPerformerImage(ctx context.Context, p *models.ScrapedPerformer, globalCo
 	return nil
 }
 
-func setSceneImage(ctx context.Context, s *models.ScrapedScene, globalConfig GlobalConfig) error {
+func setSceneImage(ctx context.Context, client *http.Client, s *models.ScrapedScene, globalConfig GlobalConfig) error {
 	// don't try to get the image if it doesn't appear to be a URL
 	if s == nil || s.Image == nil || !strings.HasPrefix(*s.Image, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(ctx, *s.Image, globalConfig)
+	img, err := getImage(ctx, *s.Image, client, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -52,14 +46,14 @@ func setSceneImage(ctx context.Context, s *models.ScrapedScene, globalConfig Glo
 	return nil
 }
 
-func setMovieFrontImage(ctx context.Context, m *models.ScrapedMovie, globalConfig GlobalConfig) error {
+func setMovieFrontImage(ctx context.Context, client *http.Client, m *models.ScrapedMovie, globalConfig GlobalConfig) error {
 	// don't try to get the image if it doesn't appear to be a URL
 	if m == nil || m.FrontImage == nil || !strings.HasPrefix(*m.FrontImage, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(ctx, *m.FrontImage, globalConfig)
+	img, err := getImage(ctx, *m.FrontImage, client, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -69,14 +63,14 @@ func setMovieFrontImage(ctx context.Context, m *models.ScrapedMovie, globalConfi
 	return nil
 }
 
-func setMovieBackImage(ctx context.Context, m *models.ScrapedMovie, globalConfig GlobalConfig) error {
+func setMovieBackImage(ctx context.Context, client *http.Client, m *models.ScrapedMovie, globalConfig GlobalConfig) error {
 	// don't try to get the image if it doesn't appear to be a URL
 	if m == nil || m.BackImage == nil || !strings.HasPrefix(*m.BackImage, "http") {
 		// nothing to do
 		return nil
 	}
 
-	img, err := getImage(ctx, *m.BackImage, globalConfig)
+	img, err := getImage(ctx, *m.BackImage, client, globalConfig)
 	if err != nil {
 		return err
 	}
@@ -86,13 +80,7 @@ func setMovieBackImage(ctx context.Context, m *models.ScrapedMovie, globalConfig
 	return nil
 }
 
-func getImage(ctx context.Context, url string, globalConfig GlobalConfig) (*string, error) {
-	client := &http.Client{
-		Transport: &http.Transport{ // ignore insecure certificates
-			TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()}},
-		Timeout: imageGetTimeout,
-	}
-
+func getImage(ctx context.Context, url string, client *http.Client, globalConfig GlobalConfig) (*string, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
 		return nil, err
@@ -137,10 +125,10 @@ func getImage(ctx context.Context, url string, globalConfig GlobalConfig) (*stri
 	return &img, nil
 }
 
-func getStashPerformerImage(ctx context.Context, stashURL string, performerID string, globalConfig GlobalConfig) (*string, error) {
-	return getImage(ctx, stashURL+"/performer/"+performerID+"/image", globalConfig)
+func getStashPerformerImage(ctx context.Context, stashURL string, performerID string, client *http.Client, globalConfig GlobalConfig) (*string, error) {
+	return getImage(ctx, stashURL+"/performer/"+performerID+"/image", client, globalConfig)
 }
 
-func getStashSceneImage(ctx context.Context, stashURL string, sceneID string, globalConfig GlobalConfig) (*string, error) {
-	return getImage(ctx, stashURL+"/scene/"+sceneID+"/screenshot", globalConfig)
+func getStashSceneImage(ctx context.Context, stashURL string, sceneID string, client *http.Client, globalConfig GlobalConfig) (*string, error) {
+	return getImage(ctx, stashURL+"/scene/"+sceneID+"/screenshot", client, globalConfig)
 }
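getImage now receives the shared client instead of allocating one per call, so every image fetch reuses the pooled connections and inherits the common timeout and TLS settings. A sketch of the same context-scoped fetch pattern; getImageBytes is a hypothetical stand-in, not the package's real helper:

```go
package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"time"
)

// getImageBytes issues a GET on the injected client, bounded by the
// caller's context rather than a client built on the spot.
func getImageBytes(ctx context.Context, client *http.Client, url string) ([]byte, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching image: http status %d", resp.StatusCode)
	}
	return io.ReadAll(resp.Body)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	b, err := getImageBytes(ctx, http.DefaultClient, "https://example.com/image.jpg")
	fmt.Println(len(b), err)
}
```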
diff --git a/pkg/scraper/json.go b/pkg/scraper/json.go
index 1543e18ca..82bf1aa0b 100644
--- a/pkg/scraper/json.go
+++ b/pkg/scraper/json.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"io"
+	"net/http"
 	"net/url"
 	"strings"
 
@@ -16,13 +17,15 @@ type jsonScraper struct {
 	scraper      scraperTypeConfig
 	config       config
 	globalConfig GlobalConfig
+	client       *http.Client
 	txnManager   models.TransactionManager
 }
 
-func newJsonScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *jsonScraper {
+func newJsonScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *jsonScraper {
 	return &jsonScraper{
 		scraper:      scraper,
 		config:       config,
+		client:       client,
 		globalConfig: globalConfig,
 		txnManager:   txnManager,
 	}
@@ -49,7 +52,7 @@ func (s *jsonScraper) scrapeURL(ctx context.Context, url string) (string, *mappe
 }
 
 func (s *jsonScraper) loadURL(ctx context.Context, url string) (string, error) {
-	r, err := loadURL(ctx, url, s.config, s.globalConfig)
+	r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig)
 	if err != nil {
 		return "", err
 	}
diff --git a/pkg/scraper/scrapers.go b/pkg/scraper/scrapers.go
index 82795bb0e..6c3d16d9c 100644
--- a/pkg/scraper/scrapers.go
+++ b/pkg/scraper/scrapers.go
@@ -2,11 +2,15 @@ package scraper
 
 import (
 	"context"
+	"crypto/tls"
 	"errors"
+	"fmt"
+	"net/http"
 	"os"
 	"path/filepath"
 	"regexp"
 	"strings"
+	"time"
 
 	"github.com/stashapp/stash/pkg/logger"
 	stash_config "github.com/stashapp/stash/pkg/manager/config"
@@ -15,6 +19,22 @@ import (
 	"github.com/stashapp/stash/pkg/utils"
 )
 
+var ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached")
+
+const (
+	// scrapeGetTimeout is the timeout for scraper HTTP requests. Includes transfer time.
+	// We may want to bump this at some point and use local context-timeouts if more granularity
+	// is needed.
+	scrapeGetTimeout = time.Second * 60
+
+	// maxIdleConnsPerHost is the maximum number of idle connections the HTTP client will
+	// keep on a per-host basis.
+	maxIdleConnsPerHost = 8
+
+	// maxRedirects defines the maximum number of redirects the HTTP client will follow
+	maxRedirects = 20
+)
+
 // GlobalConfig contains the global scraper options.
 type GlobalConfig interface {
 	GetScraperUserAgent() string
@@ -33,11 +53,32 @@ func isCDPPathWS(c GlobalConfig) bool {
 
 // Cache stores scraper details.
 type Cache struct {
+	client       *http.Client
 	scrapers     []scraper
 	globalConfig GlobalConfig
 	txnManager   models.TransactionManager
 }
 
+// newClient creates a scraper-local http client we use throughout the scraper subsystem.
+func newClient(gc GlobalConfig) *http.Client {
+	client := &http.Client{
+		Transport: &http.Transport{ // ignore insecure certificates
+			TLSClientConfig:     &tls.Config{InsecureSkipVerify: !gc.GetScraperCertCheck()},
+			MaxIdleConnsPerHost: maxIdleConnsPerHost,
+		},
+		Timeout: scrapeGetTimeout,
+		// defaultCheckRedirect code with max changed from 10 to maxRedirects
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			if len(via) >= maxRedirects {
+				return fmt.Errorf("after %d redirects: %w", maxRedirects, ErrMaxRedirects)
+			}
+			return nil
+		},
+	}
+
+	return client
+}
+
 // NewCache returns a new Cache loading scraper configurations from the
 // scraper path provided in the global config object. It returns a new
 // instance and an error if the scraper directory could not be loaded.
@@ -45,19 +86,23 @@ type Cache struct {
 // Scraper configurations are loaded from yml files in the provided scrapers
 // directory and any subdirectories.
 func NewCache(globalConfig GlobalConfig, txnManager models.TransactionManager) (*Cache, error) {
-	scrapers, err := loadScrapers(globalConfig, txnManager)
+	// HTTP Client setup
+	client := newClient(globalConfig)
+
+	scrapers, err := loadScrapers(globalConfig, client, txnManager)
 	if err != nil {
 		return nil, err
 	}
 
 	return &Cache{
+		client:       client,
 		globalConfig: globalConfig,
 		scrapers:     scrapers,
 		txnManager:   txnManager,
 	}, nil
 }
 
-func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManager) ([]scraper, error) {
+func loadScrapers(globalConfig GlobalConfig, client *http.Client, txnManager models.TransactionManager) ([]scraper, error) {
 	path := globalConfig.GetScrapersPath()
 	scrapers := make([]scraper, 0)
 
@@ -76,14 +121,14 @@ func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManage
 	}
 
 	// add built-in freeones scraper
-	scrapers = append(scrapers, getFreeonesScraper(txnManager, globalConfig), getAutoTagScraper(txnManager, globalConfig))
+	scrapers = append(scrapers, getFreeonesScraper(client, txnManager, globalConfig), getAutoTagScraper(txnManager, globalConfig))
 
 	for _, file := range scraperFiles {
 		c, err := loadConfigFromYAMLFile(file)
 		if err != nil {
 			logger.Errorf("Error loading scraper %s: %s", file, err.Error())
 		} else {
-			scraper := createScraperFromConfig(*c, txnManager, globalConfig)
+			scraper := createScraperFromConfig(*c, client, txnManager, globalConfig)
 			scrapers = append(scrapers, scraper)
 		}
 	}
@@ -95,7 +140,7 @@ func loadScrapers(globalConfig GlobalConfig, txnManager models.TransactionManage
 
 // In the event of an error during loading, the cache will be left empty.
 func (c *Cache) ReloadScrapers() error {
 	c.scrapers = nil
-	scrapers, err := loadScrapers(c.globalConfig, c.txnManager)
+	scrapers, err := loadScrapers(c.globalConfig, c.client, c.txnManager)
 	if err != nil {
 		return err
 	}
@@ -255,7 +300,7 @@ func (c Cache) postScrapePerformer(ctx context.Context, ret *models.ScrapedPerfo
 	}
 
 	// post-process - set the image if applicable
-	if err := setPerformerImage(ctx, ret, c.globalConfig); err != nil {
+	if err := setPerformerImage(ctx, c.client, ret, c.globalConfig); err != nil {
 		logger.Warnf("Could not set image using URL %s: %s", *ret.Image, err.Error())
 	}
 
@@ -323,7 +368,7 @@ func (c Cache) postScrapeScene(ctx context.Context, ret *models.ScrapedScene) er
 	}
 
 	// post-process - set the image if applicable
-	if err := setSceneImage(ctx, ret, c.globalConfig); err != nil {
+	if err := setSceneImage(ctx, c.client, ret, c.globalConfig); err != nil {
 		logger.Warnf("Could not set image using URL %s: %v", *ret.Image, err)
 	}
 
@@ -551,10 +596,10 @@ func (c Cache) ScrapeMovieURL(url string) (*models.ScrapedMovie, error) {
 	}
 
 	// post-process - set the image if applicable
-	if err := setMovieFrontImage(context.TODO(), ret, c.globalConfig); err != nil {
+	if err := setMovieFrontImage(context.TODO(), c.client, ret, c.globalConfig); err != nil {
 		logger.Warnf("Could not set front image using URL %s: %s", *ret.FrontImage, err.Error())
 	}
-	if err := setMovieBackImage(context.TODO(), ret, c.globalConfig); err != nil {
+	if err := setMovieBackImage(context.TODO(), c.client, ret, c.globalConfig); err != nil {
 		logger.Warnf("Could not set back image using URL %s: %s", *ret.BackImage, err.Error())
 	}
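newClient above centralizes the transport settings that were previously duplicated in image.go and url.go, and it wraps the redirect cap in a sentinel error. A self-contained sketch of why the %w wrapping matters: callers can detect the cap with errors.Is even through the *url.Error that http.Client returns. The sentinel name is lowercased here because this is a standalone example, not the package's exported ErrMaxRedirects:

```go
package main

import (
	"errors"
	"fmt"
	"net/http"
	"net/http/httptest"
)

var errMaxRedirects = errors.New("maximum number of HTTP redirects reached")

func main() {
	// A test server that always redirects to itself.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "/loop", http.StatusFound)
	}))
	defer srv.Close()

	client := &http.Client{
		// Same shape as newClient's CheckRedirect, with the limit inlined.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 20 {
				return fmt.Errorf("after %d redirects: %w", 20, errMaxRedirects)
			}
			return nil
		},
	}

	_, err := client.Get(srv.URL)
	// The wrapped sentinel survives the *url.Error the client returns.
	fmt.Println(errors.Is(err, errMaxRedirects)) // true
}
```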
diff --git a/pkg/scraper/stash.go b/pkg/scraper/stash.go
index df83d5fa4..50f5cc12a 100644
--- a/pkg/scraper/stash.go
+++ b/pkg/scraper/stash.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"database/sql"
 	"errors"
+	"net/http"
 	"strconv"
 
 	"github.com/jinzhu/copier"
@@ -16,13 +17,15 @@ type stashScraper struct {
 	scraper      scraperTypeConfig
 	config       config
 	globalConfig GlobalConfig
+	client       *http.Client
 	txnManager   models.TransactionManager
 }
 
-func newStashScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *stashScraper {
+func newStashScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *stashScraper {
 	return &stashScraper{
 		scraper:      scraper,
 		config:       config,
+		client:       client,
 		globalConfig: globalConfig,
 		txnManager:   txnManager,
 	}
@@ -138,7 +141,7 @@ func (s *stashScraper) scrapePerformerByFragment(scrapedPerformer models.Scraped
 	}
 
 	// get the performer image directly
-	ret.Image, err = getStashPerformerImage(context.TODO(), s.config.StashServer.URL, performerID, s.globalConfig)
+	ret.Image, err = getStashPerformerImage(context.TODO(), s.config.StashServer.URL, performerID, s.client, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
@@ -164,7 +167,7 @@ func (s *stashScraper) scrapedStashSceneToScrapedScene(scene *scrapedSceneStash)
 	}
 
 	// get the performer image directly
-	ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, scene.ID, s.globalConfig)
+	ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, scene.ID, s.client, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
@@ -251,7 +254,7 @@ func (s *stashScraper) scrapeSceneByScene(scene *models.Scene) (*models.ScrapedS
 	}
 
 	// get the performer image directly
-	ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, q.FindScene.ID, s.globalConfig)
+	ret.Image, err = getStashSceneImage(context.TODO(), s.config.StashServer.URL, q.FindScene.ID, s.client, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/scraper/stashbox/stash_box.go b/pkg/scraper/stashbox/stash_box.go
index 71dc1b109..7556b9426 100644
--- a/pkg/scraper/stashbox/stash_box.go
+++ b/pkg/scraper/stashbox/stash_box.go
@@ -7,7 +7,6 @@ import (
 	"net/http"
 	"strconv"
 	"strings"
-	"time"
 
 	"github.com/Yamashou/gqlgenc/client"
 
@@ -18,10 +17,6 @@ import (
 	"github.com/stashapp/stash/pkg/utils"
 )
 
-// Timeout to get the image. Includes transfer time. May want to make this
-// configurable at some point.
-const imageGetTimeout = time.Second * 30
-
 // Client represents the client interface to a stash-box server instance.
 type Client struct {
 	client     *graphql.Client
@@ -44,6 +39,10 @@ func NewClient(box models.StashBox, txnManager models.TransactionManager) *Clien
 	}
 }
 
+func (c Client) getHTTPClient() *http.Client {
+	return c.client.Client.Client
+}
+
 // QueryStashBoxScene queries stash-box for scenes using a query string.
 func (c Client) QueryStashBoxScene(ctx context.Context, queryStr string) ([]*models.ScrapedScene, error) {
 	scenes, err := c.client.SearchScene(ctx, queryStr)
@@ -55,7 +54,7 @@ func (c Client) QueryStashBoxScene(ctx context.Context, queryStr string) ([]*mod
 
 	var ret []*models.ScrapedScene
 	for _, s := range sceneFragments {
-		ss, err := sceneFragmentToScrapedScene(context.TODO(), c.txnManager, s)
+		ss, err := sceneFragmentToScrapedScene(context.TODO(), c.getHTTPClient(), c.txnManager, s)
 		if err != nil {
 			return nil, err
 		}
@@ -201,7 +200,7 @@ func (c Client) findStashBoxScenesByFingerprints(ctx context.Context, fingerprin
 
 	sceneFragments := scenes.FindScenesByFingerprints
 	for _, s := range sceneFragments {
-		ss, err := sceneFragmentToScrapedScene(ctx, c.txnManager, s)
+		ss, err := sceneFragmentToScrapedScene(ctx, c.getHTTPClient(), c.txnManager, s)
 		if err != nil {
 			return nil, err
 		}
@@ -509,11 +508,7 @@ func formatBodyModifications(m []*graphql.BodyModificationFragment) *string {
 	return &ret
 }
 
-func fetchImage(ctx context.Context, url string) (*string, error) {
-	client := &http.Client{
-		Timeout: imageGetTimeout,
-	}
-
+func fetchImage(ctx context.Context, client *http.Client, url string) (*string, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
 		return nil, err
@@ -595,8 +590,8 @@ func performerFragmentToScrapedScenePerformer(p graphql.PerformerFragment) *mode
 	return sp
 }
 
-func getFirstImage(ctx context.Context, images []*graphql.ImageFragment) *string {
-	ret, err := fetchImage(ctx, images[0].URL)
+func getFirstImage(ctx context.Context, client *http.Client, images []*graphql.ImageFragment) *string {
+	ret, err := fetchImage(ctx, client, images[0].URL)
 	if err != nil {
 		logger.Warnf("Error fetching image %s: %s", images[0].URL, err.Error())
 	}
@@ -617,7 +612,7 @@ func getFingerprints(scene *graphql.SceneFragment) []*models.StashBoxFingerprint
 	return fingerprints
 }
 
-func sceneFragmentToScrapedScene(ctx context.Context, txnManager models.TransactionManager, s *graphql.SceneFragment) (*models.ScrapedScene, error) {
+func sceneFragmentToScrapedScene(ctx context.Context, client *http.Client, txnManager models.TransactionManager, s *graphql.SceneFragment) (*models.ScrapedScene, error) {
 	stashID := s.ID
 	ss := &models.ScrapedScene{
 		Title:        s.Title,
@@ -634,7 +629,7 @@ func sceneFragmentToScrapedScene(ctx context.Context, txnManager models.Transact
 	if len(s.Images) > 0 {
 		// TODO - #454 code sorts images by aspect ratio according to a wanted
 		// orientation. I'm just grabbing the first for now
-		ss.Image = getFirstImage(ctx, s.Images)
+		ss.Image = getFirstImage(ctx, client, s.Images)
 	}
 
 	if err := txnManager.WithReadTxn(ctx, func(r models.ReaderRepository) error {
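getHTTPClient above avoids building a second client for stash-box image fetches by borrowing the one the generated GraphQL client already holds, hence the c.client.Client.Client field chain. A sketch of that chain using illustrative mimic types; the real types come from gqlgenc and this package:

```go
package main

import (
	"fmt"
	"net/http"
)

// innerClient and gqlClient mimic the nesting in the generated GraphQL
// client: a wrapper around a transport-level client that itself holds
// the *http.Client.
type innerClient struct {
	Client *http.Client
}

type gqlClient struct {
	Client *innerClient
}

type boxClient struct {
	client *gqlClient
}

func (c boxClient) getHTTPClient() *http.Client {
	return c.client.Client.Client
}

func main() {
	httpClient := &http.Client{}
	c := boxClient{client: &gqlClient{Client: &innerClient{Client: httpClient}}}
	fmt.Println(c.getHTTPClient() == httpClient) // true: one client, reused
}
```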
diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go
index 516c26387..b07722d3f 100644
--- a/pkg/scraper/url.go
+++ b/pkg/scraper/url.go
@@ -3,12 +3,10 @@ package scraper
 import (
 	"bytes"
 	"context"
-	"crypto/tls"
-	"errors"
 	"fmt"
 	"io"
 	"net/http"
-	"net/http/cookiejar"
+	"net/url"
 	"os"
 	"strings"
 	"time"
@@ -18,55 +16,40 @@ import (
 	"github.com/chromedp/chromedp"
 	jsoniter "github.com/json-iterator/go"
 	"golang.org/x/net/html/charset"
-	"golang.org/x/net/publicsuffix"
 
 	"github.com/stashapp/stash/pkg/logger"
 )
 
-// Timeout for the scrape http request. Includes transfer time. May want to make this
-// configurable at some point.
-const scrapeGetTimeout = time.Second * 60
 const scrapeDefaultSleep = time.Second * 2
 
-func loadURL(ctx context.Context, url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
+func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
 	driverOptions := scraperConfig.DriverOptions
 	if driverOptions != nil && driverOptions.UseCDP {
 		// get the page using chrome dp
-		return urlFromCDP(ctx, url, *driverOptions, globalConfig)
+		return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
 	}
 
-	// get the page using http.Client
-	options := cookiejar.Options{
-		PublicSuffixList: publicsuffix.List,
-	}
-	jar, er := cookiejar.New(&options)
-	if er != nil {
-		return nil, er
-	}
-
-	setCookies(jar, scraperConfig)
-	printCookies(jar, scraperConfig, "Jar cookies set from scraper")
-
-	client := &http.Client{
-		Transport: &http.Transport{ // ignore insecure certificates
-			TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()},
-		},
-		Timeout: scrapeGetTimeout,
-		// defaultCheckRedirect code with max changed from 10 to 20
-		CheckRedirect: func(req *http.Request, via []*http.Request) error {
-			if len(via) >= 20 {
-				return errors.New("stopped after 20 redirects")
-			}
-			return nil
-		},
-		Jar: jar,
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
 	if err != nil {
 		return nil, err
 	}
 
+	jar, err := scraperConfig.jar()
+	if err != nil {
+		return nil, fmt.Errorf("error creating cookie jar: %w", err)
+	}
+
+	u, err := url.Parse(loadURL)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err)
+	}
+
+	// Fetch relevant cookies from the jar for url u and add them to the request
+	cookies := jar.Cookies(u)
+	for _, cookie := range cookies {
+		req.AddCookie(cookie)
+	}
+
 	userAgent := globalConfig.GetScraperUserAgent()
 	if userAgent != "" {
 		req.Header.Set("User-Agent", userAgent)
@@ -98,7 +81,6 @@ func loadURL(ctx context.Context, url string, scraperConfig config, globalConfig
 	bodyReader := bytes.NewReader(body)
 
 	printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
-
 	return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
 }
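Because the shared client carries no cookie Jar of its own, loadURL above now materializes the scraper's jar per request and copies the matching cookies onto the request itself; presumably this keeps one scraper's cookies from leaking into another's requests through the shared client. A sketch of that per-request pattern, with made-up URL and cookie values:

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
)

func main() {
	// A locally built jar, standing in for what config.jar() returns.
	jar, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}

	target := "https://example.com/page"
	u, _ := url.Parse(target)
	jar.SetCookies(u, []*http.Cookie{{Name: "session", Value: "abc123"}})

	// Instead of attaching the jar to the client, pull the cookies that
	// match the target URL and add them to the request directly.
	req, _ := http.NewRequest(http.MethodGet, target, nil)
	for _, cookie := range jar.Cookies(u) {
		req.AddCookie(cookie)
	}

	fmt.Println(req.Header.Get("Cookie")) // session=abc123
}
```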
diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go
index 5093bd099..0f820a4cd 100644
--- a/pkg/scraper/xpath.go
+++ b/pkg/scraper/xpath.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"errors"
+	"net/http"
 	"net/url"
 	"regexp"
 	"strings"
@@ -20,14 +21,16 @@ type xpathScraper struct {
 	scraper      scraperTypeConfig
 	config       config
 	globalConfig GlobalConfig
+	client       *http.Client
 	txnManager   models.TransactionManager
 }
 
-func newXpathScraper(scraper scraperTypeConfig, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper {
+func newXpathScraper(scraper scraperTypeConfig, client *http.Client, txnManager models.TransactionManager, config config, globalConfig GlobalConfig) *xpathScraper {
 	return &xpathScraper{
 		scraper:      scraper,
 		config:       config,
 		globalConfig: globalConfig,
+		client:       client,
 		txnManager:   txnManager,
 	}
 }
@@ -227,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByFragment(gallery models.ScrapedGalleryInpu
 }
 
 func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, error) {
-	r, err := loadURL(ctx, url, s.config, s.globalConfig)
+	r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/scraper/xpath_test.go b/pkg/scraper/xpath_test.go
index fbffcc390..ff01741b7 100644
--- a/pkg/scraper/xpath_test.go
+++ b/pkg/scraper/xpath_test.go
@@ -874,7 +874,8 @@ xPathScrapers:
 
 	globalConfig := mockGlobalConfig{}
 
-	s := createScraperFromConfig(*c, nil, globalConfig)
+	client := &http.Client{}
+	s := createScraperFromConfig(*c, client, nil, globalConfig)
 	performer, err := s.Performer.scrapeByURL(ts.URL)
 
 	if err != nil {