Add http headers support to scraper (#1273)

bnkai 2021-04-16 08:42:56 +03:00 committed by GitHub
parent 0b40017b09
commit cd6b6b74eb
4 changed files with 84 additions and 11 deletions


@@ -175,11 +175,17 @@ type clickOptions struct {
 	Sleep int `yaml:"sleep"`
 }
 
+type header struct {
+	Key   string `yaml:"Key"`
+	Value string `yaml:"Value"`
+}
+
 type scraperDriverOptions struct {
 	UseCDP  bool             `yaml:"useCDP"`
 	Sleep   int              `yaml:"sleep"`
 	Clicks  []*clickOptions  `yaml:"clicks"`
 	Cookies []*cookieOptions `yaml:"cookies"`
+	Headers []*header        `yaml:"headers"`
 }
 
 func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
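For orientation (an editor's sketch, not part of the commit): the new `header` struct is picked up through the yaml struct tags above, so a `driver` block deserializes as in this minimal standalone program, assuming `gopkg.in/yaml.v2` (which matches the tag syntax) and with the options struct trimmed to the new field:

```go
package main

import (
    "fmt"

    "gopkg.in/yaml.v2"
)

// header mirrors the struct added in the commit above.
type header struct {
    Key   string `yaml:"Key"`
    Value string `yaml:"Value"`
}

// driverOptions is scraperDriverOptions trimmed to the new Headers field.
type driverOptions struct {
    Headers []*header `yaml:"headers"`
}

func main() {
    src := []byte("headers:\n  - Key: User-Agent\n    Value: My Stash Scraper\n")
    var opts driverOptions
    if err := yaml.Unmarshal(src, &opts); err != nil {
        panic(err)
    }
    for _, h := range opts.Headers {
        fmt.Printf("%s: %s\n", h.Key, h.Value) // User-Agent: My Stash Scraper
    }
}
```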


@@ -74,12 +74,21 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
 	req.Header.Set("User-Agent", userAgent)
 	}
 
+	if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
+		for _, h := range driverOptions.Headers {
+			if h.Key != "" {
+				req.Header.Set(h.Key, h.Value)
+				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+			}
+		}
+	}
+
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
 
 	if resp.StatusCode >= 400 {
-		return nil, fmt.Errorf("http error %d", resp.StatusCode)
+		return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
 	}
 
 	defer resp.Body.Close()
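A quick standalone check (editor's illustration, not from the commit) of why the ordering comment above holds: `http.Header.Set` replaces any existing value for a key, so a scraper-supplied `User-Agent` set after the global one wins:

```go
package main

import (
    "fmt"
    "net/http"
)

func main() {
    req, _ := http.NewRequest("GET", "https://example.com", nil)
    req.Header.Set("User-Agent", "global-config-ua") // set first, as in loadURL
    req.Header.Set("User-Agent", "My Stash Scraper") // scraper header overrides it
    fmt.Println(req.Header.Get("User-Agent"))        // prints: My Stash Scraper
}
```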
@@ -156,10 +165,13 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
 	defer cancel()
 
 	var res string
+	headers := cdpHeaders(driverOptions)
+
 	err := chromedp.Run(ctx,
 		network.Enable(),
 		setCDPCookies(driverOptions),
 		printCDPCookies(driverOptions, "Cookies found"),
+		network.SetExtraHTTPHeaders(network.Headers(headers)),
 		chromedp.Navigate(url),
 		chromedp.Sleep(sleepDuration),
 		setCDPClicks(driverOptions),

@@ -241,3 +253,16 @@ func cdpNetwork(enable bool) chromedp.Action {
 		return nil
 	})
 }
+
+func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
+	headers := map[string]interface{}{}
+	if driverOptions.Headers != nil {
+		for _, h := range driverOptions.Headers {
+			if h.Key != "" {
+				headers[h.Key] = h.Value
+				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
+			}
+		}
+	}
+	return headers
+}
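For CDP scrapes, the headers ride along via `network.SetExtraHTTPHeaders`, which must run after `network.Enable()`. A self-contained chromedp sketch (editor's illustration; `X-Example` is a made-up header name):

```go
package main

import (
    "context"
    "log"

    "github.com/chromedp/cdproto/network"
    "github.com/chromedp/chromedp"
)

func main() {
    ctx, cancel := chromedp.NewContext(context.Background())
    defer cancel()

    // Equivalent of cdpHeaders() output for a single configured header.
    headers := network.Headers(map[string]interface{}{
        "X-Example": "value", // hypothetical header
    })

    var html string
    err := chromedp.Run(ctx,
        network.Enable(),
        network.SetExtraHTTPHeaders(headers), // applies to every request the tab makes
        chromedp.Navigate("https://example.com"),
        chromedp.OuterHTML("html", &html),
    )
    if err != nil {
        log.Fatal(err)
    }
    log.Println(len(html), "bytes fetched with extra headers")
}
```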


@@ -4,6 +4,7 @@
 * Added scene queue.
 
 ### 🎨 Improvements
+* Support HTTP request headers in scrapers.
 * Sort performers by gender in scene/image/gallery cards and details.
 * Add popover buttons for scenes/images/galleries on performer/studio/tag cards.
 * Add slideshow to image wall view.


@@ -544,6 +544,24 @@ When developing a scraper you can have a look at the cookies set by a site by ad
 and having a look at the log / console in debug mode.
 
+### Headers
+
+Sending request headers is possible when using a scraper.
+Headers can be set in the `driver` section and are supported for plain, CDP-enabled, and JSON scrapers.
+Each header consists of a Key and a Value. If the Key is empty or not defined, the header is ignored.
+
+```yaml
+driver:
+  headers:
+    - Key: User-Agent
+      Value: My Stash Scraper
+    - Key: Authorization
+      Value: Bearer ds3sdfcFdfY17p4qBkTVF03zscUU2glSjWF17bZyoe8
+```
+
+* Headers are set after stash's `User-Agent` configuration option is applied.
+This means that setting a `User-Agent` header from the scraper overrides the one in the configuration settings.
+
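To verify what a scraper actually sends, you can point a scraper URL at a local echo server and read the headers off the console. A minimal sketch (an editor's debugging aid, not part of the docs; the port is arbitrary):

```go
package main

import (
    "fmt"
    "log"
    "net/http"
)

func main() {
    http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
        for k, v := range r.Header {
            fmt.Printf("%s: %v\n", k, v) // prints every header the scraper sent
        }
        fmt.Fprintln(w, "<html><body>ok</body></html>")
    })
    log.Fatal(http.ListenAndServe("127.0.0.1:9999", nil))
}
```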
 ### XPath scraper example
 
 A performer and scene xpath scraper is shown as an example below:
@@ -614,31 +632,42 @@ A performer and scene scraper for ThePornDB is shown below:
 name: ThePornDB
 performerByName:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/performers?q={}
+  queryURL: https://api.metadataapi.net/performers?q={}
   scraper: performerSearch
 performerByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/performers/
+      - https://api.metadataapi.net/performers/
     scraper: performerScraper
 sceneByURL:
   - action: scrapeJson
     url:
-      - https://metadataapi.net/api/scenes/
+      - https://api.metadataapi.net/scenes/
     scraper: sceneScraper
 sceneByFragment:
   action: scrapeJson
-  queryURL: https://metadataapi.net/api/scenes?parse={filename}&limit=1
+  queryURL: https://api.metadataapi.net/scenes?parse={filename}&hash={oshash}&limit=1
   scraper: sceneQueryScraper
+  queryURLReplace:
+    filename:
+      - regex: "[^a-zA-Z\\d\\-._~]" # clean filename so that it can construct a valid url
+        with: "." # "%20"
+      - regex: HEVC
+        with:
+      - regex: x265
+        with:
+      - regex: \.+
+        with: "."
 
 jsonScrapers:
   performerSearch:
     performer:
       Name: data.#.name
       URL:
         selector: data.#.id
-        replace:
-          - regex: ^
-            with: https://metadataapi.net/api/performers/
+        postProcess:
+          - replace:
+              - regex: ^
+                with: https://api.metadataapi.net/performers/
 
   performerScraper:
     common:
@@ -648,7 +677,12 @@ jsonScrapers:
       Gender: $extras.gender
       Birthdate: $extras.birthday
       Ethnicity: $extras.ethnicity
-      Height: $extras.height
+      Height:
+        selector: $extras.height
+        postProcess:
+          - replace:
+              - regex: cm
+                with:
       Measurements: $extras.measurements
       Tattoos: $extras.tattoos
       Piercings: $extras.piercings
@@ -670,7 +704,7 @@ jsonScrapers:
         Name: data.site.name
       Tags:
         Name: data.tags.#.tag
 
   sceneQueryScraper:
     common:
       $data: data.0
@@ -686,7 +720,14 @@ jsonScrapers:
       Studio:
         Name: $data.site.name
       Tags:
-        Name: $data.tags.#.tag
+        Name: $data.tags.#.tag
+
+driver:
+  headers:
+    - Key: User-Agent
+      Value: Stash JSON Scraper
+    - Key: Authorization
+      Value: Bearer lPdwFdfY17p4qBkTVF03zscUU2glSjdf17bZyoe # use an actual API Key here
 
 # Last Updated April 7, 2021
 ```
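As a rough illustration of what the `queryURLReplace` chain above does to a filename, here is the equivalent regex pipeline in Go (editor's sketch; the sample filename is invented):

```go
package main

import (
    "fmt"
    "regexp"
)

func main() {
    filename := "Some Scene x265 HEVC [1080p].mp4"

    // Apply the replacements in the same order as the queryURLReplace rules.
    steps := []struct{ regex, with string }{
        {`[^a-zA-Z\d\-._~]`, "."}, // replace url-unsafe characters with dots
        {`HEVC`, ""},              // drop codec tags
        {`x265`, ""},
        {`\.+`, "."}, // collapse runs of dots
    }
    for _, s := range steps {
        filename = regexp.MustCompile(s.regex).ReplaceAllString(filename, s.with)
    }
    fmt.Println(filename) // Some.Scene.1080p.mp4
}
```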
## Object fields