mirror of
https://github.com/stashapp/stash.git
synced 2025-12-15 04:44:28 +01:00
Add http headers support to scraper (#1273)
This commit is contained in:
parent
0b40017b09
commit
cd6b6b74eb
4 changed files with 84 additions and 11 deletions
|
|
@ -175,11 +175,17 @@ type clickOptions struct {
|
|||
Sleep int `yaml:"sleep"`
|
||||
}
|
||||
|
||||
// header describes a single HTTP request header that a scraper's driver
// configuration asks to be sent with its requests. Entries whose Key is
// empty are skipped by the code that applies the headers.
type header struct {
	// Key is the header name, read from the YAML field "Key".
	Key string `yaml:"Key"`
	// Value is the header value, read from the YAML field "Value".
	Value string `yaml:"Value"`
}
|
||||
|
||||
type scraperDriverOptions struct {
|
||||
UseCDP bool `yaml:"useCDP"`
|
||||
Sleep int `yaml:"sleep"`
|
||||
Clicks []*clickOptions `yaml:"clicks"`
|
||||
Cookies []*cookieOptions `yaml:"cookies"`
|
||||
Headers []*header `yaml:"headers"`
|
||||
}
|
||||
|
||||
func loadScraperFromYAML(id string, reader io.Reader) (*config, error) {
|
||||
|
|
|
|||
|
|
@ -74,12 +74,21 @@ func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Re
|
|||
req.Header.Set("User-Agent", userAgent)
|
||||
}
|
||||
|
||||
if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
|
||||
for _, h := range driverOptions.Headers {
|
||||
if h.Key != "" {
|
||||
req.Header.Set(h.Key, h.Value)
|
||||
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if resp.StatusCode >= 400 {
|
||||
return nil, fmt.Errorf("http error %d", resp.StatusCode)
|
||||
return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
|
@ -156,10 +165,13 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
|
|||
defer cancel()
|
||||
|
||||
var res string
|
||||
headers := cdpHeaders(driverOptions)
|
||||
|
||||
err := chromedp.Run(ctx,
|
||||
network.Enable(),
|
||||
setCDPCookies(driverOptions),
|
||||
printCDPCookies(driverOptions, "Cookies found"),
|
||||
network.SetExtraHTTPHeaders(network.Headers(headers)),
|
||||
chromedp.Navigate(url),
|
||||
chromedp.Sleep(sleepDuration),
|
||||
setCDPClicks(driverOptions),
|
||||
|
|
@ -241,3 +253,16 @@ func cdpNetwork(enable bool) chromedp.Action {
|
|||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
|
||||
headers := map[string]interface{}{}
|
||||
if driverOptions.Headers != nil {
|
||||
for _, h := range driverOptions.Headers {
|
||||
if h.Key != "" {
|
||||
headers[h.Key] = h.Value
|
||||
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
* Added scene queue.
|
||||
|
||||
### 🎨 Improvements
|
||||
* Support http request headers in scrapers.
|
||||
* Sort performers by gender in scene/image/gallery cards and details.
|
||||
* Add popover buttons for scenes/images/galleries on performer/studio/tag cards.
|
||||
* Add slideshow to image wall view.
|
||||
|
|
|
|||
|
|
@ -544,6 +544,24 @@ When developing a scraper you can have a look at the cookies set by a site by ad
|
|||
|
||||
and having a look at the log / console in debug mode.
|
||||
|
||||
### Headers
|
||||
|
||||
Sending request headers is possible when using a scraper.
|
||||
Headers can be set in the `driver` section and are supported for plain, CDP enabled and JSON scrapers.
|
||||
They consist of a Key and a Value. If the Key is empty or not defined then the header is ignored.
|
||||
|
||||
```yaml
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: My Stash Scraper
|
||||
- Key: Authorization
|
||||
Value: Bearer ds3sdfcFdfY17p4qBkTVF03zscUU2glSjWF17bZyoe8
|
||||
```
|
||||
|
||||
* headers are set after stash's `User-Agent` configuration option is applied.
|
||||
This means setting a `User-Agent` header from the scraper overrides the one in the configuration settings.
|
||||
|
||||
### XPath scraper example
|
||||
|
||||
A performer and scene xpath scraper is shown as an example below:
|
||||
|
|
@ -614,31 +632,42 @@ A performer and scene scraper for ThePornDB is shown below:
|
|||
name: ThePornDB
|
||||
performerByName:
|
||||
action: scrapeJson
|
||||
queryURL: https://metadataapi.net/api/performers?q={}
|
||||
queryURL: https://api.metadataapi.net/performers?q={}
|
||||
scraper: performerSearch
|
||||
performerByURL:
|
||||
- action: scrapeJson
|
||||
url:
|
||||
- https://metadataapi.net/api/performers/
|
||||
- https://api.metadataapi.net/performers/
|
||||
scraper: performerScraper
|
||||
sceneByURL:
|
||||
- action: scrapeJson
|
||||
url:
|
||||
- https://metadataapi.net/api/scenes/
|
||||
- https://api.metadataapi.net/scenes/
|
||||
scraper: sceneScraper
|
||||
sceneByFragment:
|
||||
action: scrapeJson
|
||||
queryURL: https://metadataapi.net/api/scenes?parse={filename}&limit=1
|
||||
queryURL: https://api.metadataapi.net/scenes?parse={filename}&hash={oshash}&limit=1
|
||||
scraper: sceneQueryScraper
|
||||
queryURLReplace:
|
||||
filename:
|
||||
- regex: "[^a-zA-Z\\d\\-._~]" # clean filename so that it can construct a valid url
|
||||
with: "." # "%20"
|
||||
- regex: HEVC
|
||||
with:
|
||||
- regex: x265
|
||||
with:
|
||||
- regex: \.+
|
||||
with: "."
|
||||
jsonScrapers:
|
||||
performerSearch:
|
||||
performer:
|
||||
Name: data.#.name
|
||||
URL:
|
||||
selector: data.#.id
|
||||
replace:
|
||||
- regex: ^
|
||||
with: https://metadataapi.net/api/performers/
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: ^
|
||||
with: https://api.metadataapi.net/performers/
|
||||
|
||||
performerScraper:
|
||||
common:
|
||||
|
|
@ -648,7 +677,12 @@ jsonScrapers:
|
|||
Gender: $extras.gender
|
||||
Birthdate: $extras.birthday
|
||||
Ethnicity: $extras.ethnicity
|
||||
Height: $extras.height
|
||||
Height:
|
||||
selector: $extras.height
|
||||
postProcess:
|
||||
- replace:
|
||||
- regex: cm
|
||||
with:
|
||||
Measurements: $extras.measurements
|
||||
Tattoos: $extras.tattoos
|
||||
Piercings: $extras.piercings
|
||||
|
|
@ -670,7 +704,7 @@ jsonScrapers:
|
|||
Name: data.site.name
|
||||
Tags:
|
||||
Name: data.tags.#.tag
|
||||
|
||||
|
||||
sceneQueryScraper:
|
||||
common:
|
||||
$data: data.0
|
||||
|
|
@ -686,7 +720,14 @@ jsonScrapers:
|
|||
Studio:
|
||||
Name: $data.site.name
|
||||
Tags:
|
||||
Name: $data.tags.#.tag
|
||||
Name: $data.tags.#.tag
|
||||
driver:
|
||||
headers:
|
||||
- Key: User-Agent
|
||||
Value: Stash JSON Scraper
|
||||
- Key: Authorization
|
||||
Value: Bearer lPdwFdfY17p4qBkTVF03zscUU2glSjdf17bZyoe # use an actual API Key here
|
||||
# Last Updated April 7, 2021
|
||||
```
|
||||
|
||||
## Object fields
|
||||
|
|
|
|||
Loading…
Reference in a new issue