Feature: Support inputURL and inputHostname in scrapers (#6250)

This commit is contained in:
Gykes 2025-11-09 20:00:47 -08:00 committed by GitHub
parent f434c1f529
commit 678b3de7c8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 102 additions and 18 deletions

View file

@ -80,7 +80,7 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, u)
// if these just return the return values from scraper.scrape* functions then // if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil // it ends up returning ScrapedContent(nil) rather than nil
switch ty { switch ty {
@ -140,7 +140,7 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, url)
q.setType(SearchQuery) q.setType(SearchQuery)
var content []ScrapedContent var content []ScrapedContent
@ -192,7 +192,7 @@ func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scen
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, url)
return scraper.scrapeScene(ctx, q) return scraper.scrapeScene(ctx, q)
} }
@ -227,7 +227,7 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, url)
return scraper.scrapeScene(ctx, q) return scraper.scrapeScene(ctx, q)
} }
@ -251,7 +251,7 @@ func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Imag
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, url)
return scraper.scrapeImage(ctx, q) return scraper.scrapeImage(ctx, q)
} }
@ -275,14 +275,15 @@ func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *model
return nil, err return nil, err
} }
q := s.getJsonQuery(doc) q := s.getJsonQuery(doc, url)
return scraper.scrapeGallery(ctx, q) return scraper.scrapeGallery(ctx, q)
} }
func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery { func (s *jsonScraper) getJsonQuery(doc string, url string) *jsonQuery {
return &jsonQuery{ return &jsonQuery{
doc: doc, doc: doc,
scraper: s, scraper: s,
url: url,
} }
} }
@ -290,6 +291,7 @@ type jsonQuery struct {
doc string doc string
scraper *jsonScraper scraper *jsonScraper
queryType QueryType queryType QueryType
url string
} }
func (q *jsonQuery) getType() QueryType { func (q *jsonQuery) getType() QueryType {
@ -300,6 +302,10 @@ func (q *jsonQuery) setType(t QueryType) {
q.queryType = t q.queryType = t
} }
// getURL returns the URL this query was created from; used to resolve
// the {inputURL} and {inputHostname} placeholders during mapping.
func (q *jsonQuery) getURL() string {
return q.url
}
func (q *jsonQuery) runQuery(selector string) ([]string, error) { func (q *jsonQuery) runQuery(selector string) ([]string, error) {
value := gjson.Get(q.doc, selector) value := gjson.Get(q.doc, selector)
@ -331,5 +337,5 @@ func (q *jsonQuery) subScrape(ctx context.Context, value string) mappedQuery {
return nil return nil
} }
return q.scraper.getJsonQuery(doc) return q.scraper.getJsonQuery(doc, value)
} }

View file

@ -5,6 +5,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"math" "math"
"net/url"
"reflect" "reflect"
"regexp" "regexp"
"strconv" "strconv"
@ -24,6 +25,7 @@ type mappedQuery interface {
getType() QueryType getType() QueryType
setType(QueryType) setType(QueryType)
subScrape(ctx context.Context, value string) mappedQuery subScrape(ctx context.Context, value string) mappedQuery
getURL() string
} }
type commonMappedConfig map[string]string type commonMappedConfig map[string]string
@ -43,6 +45,22 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
return ret return ret
} }
// extractHostname returns the hostname portion of urlStr.
// An empty string is returned when the input is empty or cannot be
// parsed as a URL; a parse failure is logged as a warning.
func extractHostname(urlStr string) string {
	if len(urlStr) == 0 {
		return ""
	}

	parsed, err := url.Parse(urlStr)
	if err == nil {
		return parsed.Hostname()
	}

	logger.Warnf("Error parsing URL '%s': %s", urlStr, err.Error())
	return ""
}
type isMultiFunc func(key string) bool type isMultiFunc func(key string) bool
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults { func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
@ -53,10 +71,16 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM
if attrConfig.Fixed != "" { if attrConfig.Fixed != "" {
// TODO - not sure if this needs to set _all_ indexes for the key // TODO - not sure if this needs to set _all_ indexes for the key
const i = 0 const i = 0
ret = ret.setSingleValue(i, k, attrConfig.Fixed) // Support {inputURL} and {inputHostname} placeholders in fixed values
value := strings.ReplaceAll(attrConfig.Fixed, "{inputURL}", q.getURL())
value = strings.ReplaceAll(value, "{inputHostname}", extractHostname(q.getURL()))
ret = ret.setSingleValue(i, k, value)
} else { } else {
selector := attrConfig.Selector selector := attrConfig.Selector
selector = s.applyCommon(common, selector) selector = s.applyCommon(common, selector)
// Support {inputURL} and {inputHostname} placeholders in selectors
selector = strings.ReplaceAll(selector, "{inputURL}", q.getURL())
selector = strings.ReplaceAll(selector, "{inputHostname}", extractHostname(q.getURL()))
found, err := q.runQuery(selector) found, err := q.runQuery(selector)
if err != nil { if err != nil {

View file

@ -61,7 +61,7 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, u)
// if these just return the return values from scraper.scrape* functions then // if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil // it ends up returning ScrapedContent(nil) rather than nil
switch ty { switch ty {
@ -121,7 +121,7 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, url)
q.setType(SearchQuery) q.setType(SearchQuery)
var content []ScrapedContent var content []ScrapedContent
@ -171,7 +171,7 @@ func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sce
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, url)
return scraper.scrapeScene(ctx, q) return scraper.scrapeScene(ctx, q)
} }
@ -206,7 +206,7 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, url)
return scraper.scrapeScene(ctx, q) return scraper.scrapeScene(ctx, q)
} }
@ -230,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, url)
return scraper.scrapeGallery(ctx, q) return scraper.scrapeGallery(ctx, q)
} }
@ -254,7 +254,7 @@ func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Ima
return nil, err return nil, err
} }
q := s.getXPathQuery(doc) q := s.getXPathQuery(doc, url)
return scraper.scrapeImage(ctx, q) return scraper.scrapeImage(ctx, q)
} }
@ -277,10 +277,11 @@ func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, err
return ret, err return ret, err
} }
func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery { func (s *xpathScraper) getXPathQuery(doc *html.Node, url string) *xpathQuery {
return &xpathQuery{ return &xpathQuery{
doc: doc, doc: doc,
scraper: s, scraper: s,
url: url,
} }
} }
@ -288,6 +289,7 @@ type xpathQuery struct {
doc *html.Node doc *html.Node
scraper *xpathScraper scraper *xpathScraper
queryType QueryType queryType QueryType
url string
} }
func (q *xpathQuery) getType() QueryType { func (q *xpathQuery) getType() QueryType {
@ -298,6 +300,10 @@ func (q *xpathQuery) setType(t QueryType) {
q.queryType = t q.queryType = t
} }
// getURL returns the URL this query was created from; used to resolve
// the {inputURL} and {inputHostname} placeholders during mapping.
func (q *xpathQuery) getURL() string {
return q.url
}
func (q *xpathQuery) runQuery(selector string) ([]string, error) { func (q *xpathQuery) runQuery(selector string) ([]string, error) {
found, err := htmlquery.QueryAll(q.doc, selector) found, err := htmlquery.QueryAll(q.doc, selector)
if err != nil { if err != nil {
@ -346,5 +352,5 @@ func (q *xpathQuery) subScrape(ctx context.Context, value string) mappedQuery {
return nil return nil
} }
return q.scraper.getXPathQuery(doc) return q.scraper.getXPathQuery(doc, value)
} }

View file

@ -325,10 +325,58 @@ Alternatively, an attribute value may be set to a fixed value, rather than scrap
```yaml ```yaml
performer: performer:
Gender: Gender:
fixed: Female fixed: Female
``` ```
### Input URL placeholders
The `{inputURL}` and `{inputHostname}` placeholders can be used in both `fixed` values and `selector` expressions to access information about the URL that was actually used to fetch the content.
#### {inputURL}
The `{inputURL}` placeholder provides access to the full URL. This is useful when you want to return or reference the source URL as part of the scraped data.
For example:
```yaml
scene:
URL:
fixed: "{inputURL}"
Title:
selector: //h1[@class="title"]
```
When scraping from `https://example.com/scene/12345`, the `{inputURL}` placeholder will be replaced with `https://example.com/scene/12345`.
#### {inputHostname}
The `{inputHostname}` placeholder extracts just the hostname from the URL. This is useful when you need to reference the domain without manually parsing the URL.
For example:
```yaml
scene:
Studio:
fixed: "{inputHostname}"
Details:
selector: //div[@data-domain="{inputHostname}"]//p[@class="description"]
```
When scraping from `https://example.com/scene/12345`, the `{inputHostname}` placeholder will be replaced with `example.com`.
Both placeholders can also be combined within selectors in the same scraper configuration:
```yaml
scene:
Details:
selector: //div[@data-url="{inputURL}"]//p[@class="description"]
Site:
selector: //div[@data-host="{inputHostname}"]//span[@class="site-name"]
```
> **Note:** These placeholders represent the actual URL used to fetch the content, after any URL replacements have been applied.
### Common fragments ### Common fragments
The `common` field is used to configure selector fragments that can be referenced in the selector strings. These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example: The `common` field is used to configure selector fragments that can be referenced in the selector strings. These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example: