Feature: Support inputURL and inputHostname in scrapers (#6250)
parent f434c1f529
commit 678b3de7c8

4 changed files with 102 additions and 18 deletions
JSON scraper (jsonScraper / jsonQuery):

@@ -80,7 +80,7 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, u)
     // if these just return the return values from scraper.scrape* functions then
     // it ends up returning ScrapedContent(nil) rather than nil
     switch ty {
@@ -140,7 +140,7 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, url)
     q.setType(SearchQuery)
 
     var content []ScrapedContent
@@ -192,7 +192,7 @@ func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scen
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, url)
     return scraper.scrapeScene(ctx, q)
 }
 
@@ -227,7 +227,7 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, url)
     return scraper.scrapeScene(ctx, q)
 }
 
@@ -251,7 +251,7 @@ func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Imag
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, url)
     return scraper.scrapeImage(ctx, q)
 }
 
@@ -275,14 +275,15 @@ func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *model
         return nil, err
     }
 
-    q := s.getJsonQuery(doc)
+    q := s.getJsonQuery(doc, url)
     return scraper.scrapeGallery(ctx, q)
 }
 
-func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery {
+func (s *jsonScraper) getJsonQuery(doc string, url string) *jsonQuery {
     return &jsonQuery{
         doc:     doc,
         scraper: s,
+        url:     url,
     }
 }
 
@@ -290,6 +291,7 @@ type jsonQuery struct {
     doc       string
     scraper   *jsonScraper
     queryType QueryType
+    url       string
 }
 
 func (q *jsonQuery) getType() QueryType {
@@ -300,6 +302,10 @@ func (q *jsonQuery) setType(t QueryType) {
     q.queryType = t
 }
 
+func (q *jsonQuery) getURL() string {
+    return q.url
+}
+
 func (q *jsonQuery) runQuery(selector string) ([]string, error) {
     value := gjson.Get(q.doc, selector)
 
@@ -331,5 +337,5 @@ func (q *jsonQuery) subScrape(ctx context.Context, value string) mappedQuery {
         return nil
     }
 
-    return q.scraper.getJsonQuery(doc)
+    return q.scraper.getJsonQuery(doc, value)
 }
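Both scrapers follow the same pattern: the query object is now constructed with the URL it was loaded from and exposes it through `getURL()`. The following is a minimal, self-contained Go sketch of that pattern; the `sourceQuery` type and the literal document are illustrative stand-ins, not the stash code itself.

```go
package main

import "fmt"

// sourceQuery is an illustrative stand-in for jsonQuery/xpathQuery: it keeps
// the scraped document together with the URL the document was fetched from.
type sourceQuery struct {
	doc string // the scraped document (JSON text here; a parsed HTML tree in xpathQuery)
	url string // the input URL, threaded in by the scrapeBy* entry points
}

// getURL mirrors the accessor this commit adds to the mappedQuery interface.
func (q *sourceQuery) getURL() string {
	return q.url
}

func main() {
	// scrapeByURL/scrapeByFragment/etc. construct the query with the URL they
	// just loaded, so later processing can refer back to it.
	q := &sourceQuery{doc: `{"title": "example"}`, url: "https://example.com/scene/12345"}
	fmt.Println(q.getURL()) // https://example.com/scene/12345
}
```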
Mapped scraper config (mappedQuery interface and mappedConfig.process, shared by the JSON and XPath scrapers):

@@ -5,6 +5,7 @@ import (
     "errors"
     "fmt"
     "math"
+    "net/url"
     "reflect"
     "regexp"
     "strconv"
@@ -24,6 +25,7 @@ type mappedQuery interface {
     getType() QueryType
     setType(QueryType)
     subScrape(ctx context.Context, value string) mappedQuery
+    getURL() string
 }
 
 type commonMappedConfig map[string]string
@@ -43,6 +45,22 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
     return ret
 }
 
+// extractHostname parses a URL string and returns the hostname.
+// Returns empty string if the URL cannot be parsed.
+func extractHostname(urlStr string) string {
+    if urlStr == "" {
+        return ""
+    }
+
+    u, err := url.Parse(urlStr)
+    if err != nil {
+        logger.Warnf("Error parsing URL '%s': %s", urlStr, err.Error())
+        return ""
+    }
+
+    return u.Hostname()
+}
+
 type isMultiFunc func(key string) bool
 
 func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
@@ -53,10 +71,16 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM
         if attrConfig.Fixed != "" {
             // TODO - not sure if this needs to set _all_ indexes for the key
             const i = 0
-            ret = ret.setSingleValue(i, k, attrConfig.Fixed)
+            // Support {inputURL} and {inputHostname} placeholders in fixed values
+            value := strings.ReplaceAll(attrConfig.Fixed, "{inputURL}", q.getURL())
+            value = strings.ReplaceAll(value, "{inputHostname}", extractHostname(q.getURL()))
+            ret = ret.setSingleValue(i, k, value)
         } else {
             selector := attrConfig.Selector
             selector = s.applyCommon(common, selector)
+            // Support {inputURL} and {inputHostname} placeholders in selectors
+            selector = strings.ReplaceAll(selector, "{inputURL}", q.getURL())
+            selector = strings.ReplaceAll(selector, "{inputHostname}", extractHostname(q.getURL()))
 
             found, err := q.runQuery(selector)
             if err != nil {
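The substitution itself is plain `strings.ReplaceAll` over the fixed value or selector, with the hostname derived via `net/url`. Below is a standalone sketch of that logic; the `expandPlaceholders` helper is an illustrative wrapper, while the real code inlines the replacements in `process` and logs parse failures via `logger.Warnf`.

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

// extractHostname mirrors the helper added above: parse the URL and return
// its hostname, or an empty string if it cannot be parsed.
func extractHostname(urlStr string) string {
	if urlStr == "" {
		return ""
	}
	u, err := url.Parse(urlStr)
	if err != nil {
		// the real helper logs a warning here via logger.Warnf
		return ""
	}
	return u.Hostname()
}

// expandPlaceholders applies the same two replacements the process loop now
// performs on fixed values and selectors.
func expandPlaceholders(s, inputURL string) string {
	s = strings.ReplaceAll(s, "{inputURL}", inputURL)
	s = strings.ReplaceAll(s, "{inputHostname}", extractHostname(inputURL))
	return s
}

func main() {
	input := "https://example.com/scene/12345"
	fmt.Println(expandPlaceholders(`fixed: "{inputURL}"`, input))
	fmt.Println(expandPlaceholders(`//div[@data-domain="{inputHostname}"]`, input))
	// Output:
	// fixed: "https://example.com/scene/12345"
	// //div[@data-domain="example.com"]
}
```

Run against `https://example.com/scene/12345`, `{inputHostname}` resolves to `example.com`, matching the documentation example further down.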
XPath scraper (xpathScraper / xpathQuery):

@@ -61,7 +61,7 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, u)
     // if these just return the return values from scraper.scrape* functions then
     // it ends up returning ScrapedContent(nil) rather than nil
     switch ty {
@@ -121,7 +121,7 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, url)
     q.setType(SearchQuery)
 
     var content []ScrapedContent
@@ -171,7 +171,7 @@ func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sce
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, url)
     return scraper.scrapeScene(ctx, q)
 }
 
@@ -206,7 +206,7 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, url)
     return scraper.scrapeScene(ctx, q)
 }
 
@@ -230,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, url)
     return scraper.scrapeGallery(ctx, q)
 }
 
@@ -254,7 +254,7 @@ func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Ima
         return nil, err
     }
 
-    q := s.getXPathQuery(doc)
+    q := s.getXPathQuery(doc, url)
     return scraper.scrapeImage(ctx, q)
 }
 
@@ -277,10 +277,11 @@ func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, err
     return ret, err
 }
 
-func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
+func (s *xpathScraper) getXPathQuery(doc *html.Node, url string) *xpathQuery {
     return &xpathQuery{
         doc:     doc,
         scraper: s,
+        url:     url,
     }
 }
 
@@ -288,6 +289,7 @@ type xpathQuery struct {
     doc       *html.Node
     scraper   *xpathScraper
     queryType QueryType
+    url       string
 }
 
 func (q *xpathQuery) getType() QueryType {
@@ -298,6 +300,10 @@ func (q *xpathQuery) setType(t QueryType) {
     q.queryType = t
 }
 
+func (q *xpathQuery) getURL() string {
+    return q.url
+}
+
 func (q *xpathQuery) runQuery(selector string) ([]string, error) {
     found, err := htmlquery.QueryAll(q.doc, selector)
     if err != nil {
@@ -346,5 +352,5 @@ func (q *xpathQuery) subScrape(ctx context.Context, value string) mappedQuery {
         return nil
     }
 
-    return q.scraper.getXPathQuery(doc)
+    return q.scraper.getXPathQuery(doc, value)
 }
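One consequence of the `subScrape` changes in both scrapers: the sub-query is built with the sub-scraped value as its URL (`getJsonQuery(doc, value)` / `getXPathQuery(doc, value)`), so within a sub-scrape the placeholders should resolve against the URL being sub-scraped rather than the original input. A hypothetical illustration of that hand-off follows; the `query` type and document strings are invented for the example.

```go
package main

import "fmt"

// query is an illustrative stand-in for jsonQuery/xpathQuery.
type query struct {
	doc string
	url string
}

// subScrape sketches the hand-off in the diff above: the sub-scraped value is
// both the URL that gets loaded and the URL recorded on the new query.
func (q *query) subScrape(value string) *query {
	doc := "<loaded from " + value + ">" // stands in for loadURL(ctx, value)
	return &query{doc: doc, url: value}  // mirrors getJsonQuery(doc, value) / getXPathQuery(doc, value)
}

func main() {
	outer := &query{doc: "<outer doc>", url: "https://example.com/scene/12345"}
	inner := outer.subScrape("https://example.com/performer/678")
	fmt.Println(outer.url) // https://example.com/scene/12345
	fmt.Println(inner.url) // https://example.com/performer/678
}
```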
Scraper configuration documentation:

@@ -325,10 +325,58 @@ Alternatively, an attribute value may be set to a fixed value, rather than scrap
 
 ```yaml
 performer:
   Gender:
     fixed: Female
 ```
 
+### Input URL placeholders
+
+The `{inputURL}` and `{inputHostname}` placeholders can be used in both `fixed` values and `selector` expressions to access information about the original URL that was used to scrape the content.
+
+#### {inputURL}
+
+The `{inputURL}` placeholder provides access to the full URL. This is useful when you want to return or reference the source URL as part of the scraped data.
+
+For example:
+
+```yaml
+scene:
+  URL:
+    fixed: "{inputURL}"
+  Title:
+    selector: //h1[@class="title"]
+```
+
+When scraping from `https://example.com/scene/12345`, the `{inputURL}` placeholder will be replaced with `https://example.com/scene/12345`.
+
+#### {inputHostname}
+
+The `{inputHostname}` placeholder extracts just the hostname from the URL. This is useful when you need to reference the domain without manually parsing the URL.
+
+For example:
+
+```yaml
+scene:
+  Studio:
+    fixed: "{inputHostname}"
+  Details:
+    selector: //div[@data-domain="{inputHostname}"]//p[@class="description"]
+```
+
+When scraping from `https://example.com/scene/12345`, the `{inputHostname}` placeholder will be replaced with `example.com`.
+
+These placeholders can also be used within selectors for more advanced use cases:
+
+```yaml
+scene:
+  Details:
+    selector: //div[@data-url="{inputURL}"]//p[@class="description"]
+  Site:
+    selector: //div[@data-host="{inputHostname}"]//span[@class="site-name"]
+```
+
+> **Note:** These placeholders represent the actual URL used to fetch the content, after any URL replacements have been applied.
+
 ### Common fragments
 
 The `common` field is used to configure selector fragments that can be referenced in the selector strings. These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example:
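For reference, a quick check of what hostname extraction yields for a few representative inputs, using Go's `net/url` the same way the new helper does; the empty-string fallback on parse errors matches the helper's behaviour, and the sample URLs are invented.

```go
package main

import (
	"fmt"
	"net/url"
)

func hostnameOf(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return "" // the scraper helper also falls back to "" on parse errors
	}
	return u.Hostname()
}

func main() {
	// {inputHostname} resolves to just the host, with any port stripped.
	fmt.Println(hostnameOf("https://example.com/scene/12345"))   // example.com
	fmt.Println(hostnameOf("https://example.com:8443/scene/1"))  // example.com
	fmt.Println(hostnameOf("https://sub.example.com/gallery/2")) // sub.example.com
}
```

Because any port is stripped by `Hostname()`, `{inputHostname}` is suitable for matching against bare domain attributes in selectors.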