diff --git a/pkg/scraper/config.go b/pkg/scraper/config.go
index 7d5d49ebf..4eeb97af3 100644
--- a/pkg/scraper/config.go
+++ b/pkg/scraper/config.go
@@ -175,11 +175,17 @@ type clickOptions struct {
 	Sleep int `yaml:"sleep"`
 }
 
+type header struct {
+	Key   string `yaml:"Key"`
+	Value string `yaml:"Value"`
+}
+
 type scraperDriverOptions struct {
 	UseCDP  bool             `yaml:"useCDP"`
 	Sleep   int              `yaml:"sleep"`
 	Clicks  []*clickOptions  `yaml:"clicks"`
 	Cookies []*cookieOptions `yaml:"cookies"`
+	Headers []*header        `yaml:"headers"`
 }
 
 func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go
index 4404dc067..baa35b07e 100644
--- a/pkg/scraper/url.go
+++ b/pkg/scraper/url.go
@@ -74,12 +74,21 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
 		req.Header.Set("User-Agent", userAgent)
 	}
 
+	if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
+		for _, h := range driverOptions.Headers {
+			if h.Key != "" {
+				req.Header.Set(h.Key, h.Value)
+				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+			}
+		}
+	}
+
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
 
 	if resp.StatusCode >= 400 {
-		return nil, fmt.Errorf("http error %d", resp.StatusCode)
+		return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
 	}
 	defer resp.Body.Close()
@@ -156,10 +165,13 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
 	defer cancel()
 
 	var res string
+	headers := cdpHeaders(driverOptions)
+
 	err := chromedp.Run(ctx,
 		network.Enable(),
 		setCDPCookies(driverOptions),
 		printCDPCookies(driverOptions, "Cookies found"),
+		network.SetExtraHTTPHeaders(network.Headers(headers)),
 		chromedp.Navigate(url),
 		chromedp.Sleep(sleepDuration),
 		setCDPClicks(driverOptions),
@@ -241,3 +253,16 @@ func cdpNetwork(enable bool) chromedp.Action {
 		return nil
 	})
 }
+
+func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
+	headers := map[string]interface{}{}
+	if driverOptions.Headers != nil {
+		for _, h := range driverOptions.Headers {
+			if h.Key != "" {
+				headers[h.Key] = h.Value
+				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+			}
+		}
+	}
+	return headers
+}
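Both code paths read the same `Headers` slice, which is unmarshalled from the `driver` block of a scraper's YAML file. A minimal sketch of a driver block that would exercise the code above, with made-up values (the `Referer` entry and the empty-`Key` entry are purely illustrative):

```yaml
# Illustrative only: with useCDP false, these headers are applied via
# req.Header.Set in loadURL; with useCDP true, they are collected by
# cdpHeaders and sent through network.SetExtraHTTPHeaders.
driver:
  useCDP: false      # flip to true to route the request through Chrome
  headers:
    - Key: Referer   # hypothetical header; any non-empty Key is applied
      Value: https://example.com/
    - Key: ""        # empty Key: skipped by both code paths
      Value: never sent
```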
diff --git a/ui/v2.5/src/components/Changelog/versions/v070.md b/ui/v2.5/src/components/Changelog/versions/v070.md
index 6a76e8fb4..f0e5ebffa 100644
--- a/ui/v2.5/src/components/Changelog/versions/v070.md
+++ b/ui/v2.5/src/components/Changelog/versions/v070.md
@@ -4,6 +4,7 @@
 * Added scene queue.
 
 ### 🎨 Improvements
+* Support http request headers in scrapers.
 * Sort performers by gender in scene/image/gallery cards and details.
 * Add popover buttons for scenes/images/galleries on performer/studio/tag cards.
 * Add slideshow to image wall view.
diff --git a/ui/v2.5/src/docs/en/Scraping.md b/ui/v2.5/src/docs/en/Scraping.md
index 6f47d0070..43a4407e3 100644
--- a/ui/v2.5/src/docs/en/Scraping.md
+++ b/ui/v2.5/src/docs/en/Scraping.md
@@ -544,6 +544,24 @@ When developing a scraper you can have a look at the cookies set by a site by ad
 and having a look at the log / console in debug mode.
 
+### Headers
+
+Sending request headers is possible when using a scraper.
+Headers can be set in the `driver` section and are supported for plain, CDP-enabled and JSON scrapers.
+They consist of a Key and a Value. If the Key is empty or not defined then the header is ignored.
+
+```yaml
+driver:
+  headers:
+    - Key: User-Agent
+      Value: My Stash Scraper
+    - Key: Authorization
+      Value: Bearer ds3sdfcFdfY17p4qBkTVF03zscUU2glSjWF17bZyoe8
+```
+
+* Headers are set after stash's `User-Agent` configuration option is applied.
+This means that setting a `User-Agent` header from the scraper overrides the one in the configuration settings.
+
 ### XPath scraper example
 
 A performer and scene xpath scraper is shown as an example below:
@@ -614,31 +632,42 @@ A performer and scene scraper for ThePornDB is shown below:
 name: ThePornDB
 performerByName:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/performers?q={}
+  queryURL: https://api.metadataapi.net/performers?q={}
   scraper: performerSearch
 performerByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/performers/
+      - https://api.metadataapi.net/performers/
     scraper: performerScraper
 sceneByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/scenes/
+      - https://api.metadataapi.net/scenes/
     scraper: sceneScraper
 sceneByFragment:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/scenes?parse={filename}&limit=1
+  queryURL: https://api.metadataapi.net/scenes?parse={filename}&hash={oshash}&limit=1
   scraper: sceneQueryScraper
+  queryURLReplace:
+    filename:
+      - regex: "[^a-zA-Z\\d\\-._~]" # clean filename so that it can construct a valid url
+        with: "." # "%20"
+      - regex: HEVC
+        with:
+      - regex: x265
+        with:
+      - regex: \.+
+        with: "."
 
 jsonScrapers:
   performerSearch:
     performer:
       Name: data.#.name
       URL:
         selector: data.#.id
-        replace:
-          - regex: ^
-            with: https://metadataapi.net/api/performers/
+        postProcess:
+          - replace:
+              - regex: ^
+                with: https://api.metadataapi.net/performers/
   performerScraper:
     common:
@@ -648,7 +677,12 @@ jsonScrapers:
       Gender: $extras.gender
       Birthdate: $extras.birthday
       Ethnicity: $extras.ethnicity
-      Height: $extras.height
+      Height:
+        selector: $extras.height
+        postProcess:
+          - replace:
+              - regex: cm
+                with:
       Measurements: $extras.measurements
       Tattoos: $extras.tattoos
       Piercings: $extras.piercings
@@ -670,7 +704,7 @@ jsonScrapers:
       Name: data.site.name
       Tags:
         Name: data.tags.#.tag
-  
+
   sceneQueryScraper:
     common:
       $data: data.0
@@ -686,7 +720,14 @@ jsonScrapers:
       Studio:
         Name: $data.site.name
       Tags:
-        Name: $data.tags.#.tag  
+        Name: $data.tags.#.tag
+driver:
+  headers:
+    - Key: User-Agent
+      Value: Stash JSON Scraper
+    - Key: Authorization
+      Value: Bearer lPdwFdfY17p4qBkTVF03zscUU2glSjdf17bZyoe # use an actual API Key here
+# Last Updated April 7, 2021
 ```
 
 ## Object fields
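A note on the `queryURLReplace` chain added to the ThePornDB example above: the four regexes run in order against the `{filename}` placeholder, first mapping every character outside `[a-zA-Z\d\-._~]` to a dot, then dropping the `HEVC` and `x265` tags, then collapsing runs of dots. A worked pass over a hypothetical filename:

```yaml
# Hypothetical input:        My Scene 2021 HEVC x265.mp4
# after "[^a-zA-Z\\d\\-._~]" -> My.Scene.2021.HEVC.x265.mp4  (spaces become ".")
# after "HEVC"               -> My.Scene.2021..x265.mp4      (tag removed)
# after "x265"               -> My.Scene.2021...mp4          (tag removed)
# after "\.+"                -> My.Scene.2021.mp4            (dots collapsed)
# queryURL then becomes (with {oshash} substituted separately by stash):
#   https://api.metadataapi.net/scenes?parse=My.Scene.2021.mp4&hash={oshash}&limit=1
```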