Feature: Support inputURL and inputHostname in scrapers (#6250)

Gykes 2025-11-09 20:00:47 -08:00 committed by GitHub
parent f434c1f529
commit 678b3de7c8
4 changed files with 102 additions and 18 deletions


@@ -80,7 +80,7 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, u)
// if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil
switch ty {
@@ -140,7 +140,7 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, url)
q.setType(SearchQuery)
var content []ScrapedContent
@@ -192,7 +192,7 @@ func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scen
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, url)
return scraper.scrapeScene(ctx, q)
}
@@ -227,7 +227,7 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, url)
return scraper.scrapeScene(ctx, q)
}
@@ -251,7 +251,7 @@ func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Imag
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, url)
return scraper.scrapeImage(ctx, q)
}
@@ -275,14 +275,15 @@ func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *model
return nil, err
}
-q := s.getJsonQuery(doc)
+q := s.getJsonQuery(doc, url)
return scraper.scrapeGallery(ctx, q)
}
-func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery {
+func (s *jsonScraper) getJsonQuery(doc string, url string) *jsonQuery {
return &jsonQuery{
doc: doc,
scraper: s,
+url: url,
}
}
@@ -290,6 +291,7 @@ type jsonQuery struct {
doc string
scraper *jsonScraper
queryType QueryType
+url string
}
func (q *jsonQuery) getType() QueryType {
@@ -300,6 +302,10 @@ func (q *jsonQuery) setType(t QueryType) {
q.queryType = t
}
+func (q *jsonQuery) getURL() string {
+return q.url
+}
func (q *jsonQuery) runQuery(selector string) ([]string, error) {
value := gjson.Get(q.doc, selector)
@@ -331,5 +337,5 @@ func (q *jsonQuery) subScrape(ctx context.Context, value string) mappedQuery {
return nil
}
-return q.scraper.getJsonQuery(doc)
+return q.scraper.getJsonQuery(doc, value)
}


@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"math"
+"net/url"
"reflect"
"regexp"
"strconv"
@@ -24,6 +25,7 @@ type mappedQuery interface {
getType() QueryType
setType(QueryType)
subScrape(ctx context.Context, value string) mappedQuery
+getURL() string
}
type commonMappedConfig map[string]string
@@ -43,6 +45,22 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
return ret
}
+// extractHostname parses a URL string and returns the hostname.
+// Returns empty string if the URL cannot be parsed.
+func extractHostname(urlStr string) string {
+if urlStr == "" {
+return ""
+}
+u, err := url.Parse(urlStr)
+if err != nil {
+logger.Warnf("Error parsing URL '%s': %s", urlStr, err.Error())
+return ""
+}
+return u.Hostname()
+}
type isMultiFunc func(key string) bool
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
@@ -53,10 +71,16 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM
if attrConfig.Fixed != "" {
// TODO - not sure if this needs to set _all_ indexes for the key
const i = 0
-ret = ret.setSingleValue(i, k, attrConfig.Fixed)
+// Support {inputURL} and {inputHostname} placeholders in fixed values
+value := strings.ReplaceAll(attrConfig.Fixed, "{inputURL}", q.getURL())
+value = strings.ReplaceAll(value, "{inputHostname}", extractHostname(q.getURL()))
+ret = ret.setSingleValue(i, k, value)
} else {
selector := attrConfig.Selector
selector = s.applyCommon(common, selector)
+// Support {inputURL} and {inputHostname} placeholders in selectors
+selector = strings.ReplaceAll(selector, "{inputURL}", q.getURL())
+selector = strings.ReplaceAll(selector, "{inputHostname}", extractHostname(q.getURL()))
found, err := q.runQuery(selector)
if err != nil {
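
For context, here is a minimal standalone sketch of what the substitution in the hunk above does, outside the scraper code path. The URL and the selector string are invented for illustration only:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	// hypothetical source URL that a scrape was started from
	inputURL := "https://example.com/scene/12345"

	// hostname extraction, mirroring extractHostname above
	// (the parse error is ignored here for brevity; the real helper logs it and returns "")
	u, _ := url.Parse(inputURL)
	hostname := u.Hostname()

	// a selector containing both placeholders, as a scraper config might define
	selector := `//div[@data-host="{inputHostname}"]//a[@href="{inputURL}"]`
	selector = strings.ReplaceAll(selector, "{inputURL}", inputURL)
	selector = strings.ReplaceAll(selector, "{inputHostname}", hostname)

	fmt.Println(selector)
	// //div[@data-host="example.com"]//a[@href="https://example.com/scene/12345"]
}
```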


@@ -61,7 +61,7 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, u)
// if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil
switch ty {
@@ -121,7 +121,7 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, url)
q.setType(SearchQuery)
var content []ScrapedContent
@@ -171,7 +171,7 @@ func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sce
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, url)
return scraper.scrapeScene(ctx, q)
}
@@ -206,7 +206,7 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, url)
return scraper.scrapeScene(ctx, q)
}
@@ -230,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, url)
return scraper.scrapeGallery(ctx, q)
}
@@ -254,7 +254,7 @@ func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Ima
return nil, err
}
-q := s.getXPathQuery(doc)
+q := s.getXPathQuery(doc, url)
return scraper.scrapeImage(ctx, q)
}
@@ -277,10 +277,11 @@ func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, err
return ret, err
}
-func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
+func (s *xpathScraper) getXPathQuery(doc *html.Node, url string) *xpathQuery {
return &xpathQuery{
doc: doc,
scraper: s,
+url: url,
}
}
@@ -288,6 +289,7 @@ type xpathQuery struct {
doc *html.Node
scraper *xpathScraper
queryType QueryType
+url string
}
func (q *xpathQuery) getType() QueryType {
@@ -298,6 +300,10 @@ func (q *xpathQuery) setType(t QueryType) {
q.queryType = t
}
+func (q *xpathQuery) getURL() string {
+return q.url
+}
func (q *xpathQuery) runQuery(selector string) ([]string, error) {
found, err := htmlquery.QueryAll(q.doc, selector)
if err != nil {
@@ -346,5 +352,5 @@ func (q *xpathQuery) subScrape(ctx context.Context, value string) mappedQuery {
return nil
}
-return q.scraper.getXPathQuery(doc)
+return q.scraper.getXPathQuery(doc, value)
}


@@ -325,10 +325,58 @@ Alternatively, an attribute value may be set to a fixed value, rather than scrap
```yaml
performer:
  Gender:
    fixed: Female
```

### Input URL placeholders
The `{inputURL}` and `{inputHostname}` placeholders can be used in both `fixed` values and `selector` expressions to access information about the original URL that was used to scrape the content.
#### {inputURL}
The `{inputURL}` placeholder provides access to the full URL. This is useful when you want to return or reference the source URL as part of the scraped data.
For example:
```yaml
scene:
  URL:
    fixed: "{inputURL}"
  Title:
    selector: //h1[@class="title"]
```
When scraping from `https://example.com/scene/12345`, the `{inputURL}` placeholder will be replaced with `https://example.com/scene/12345`.
#### {inputHostname}
The `{inputHostname}` placeholder extracts just the hostname from the URL. This is useful when you need to reference the domain without manually parsing the URL.
For example:
```yaml
scene:
  Studio:
    fixed: "{inputHostname}"
  Details:
    selector: //div[@data-domain="{inputHostname}"]//p[@class="description"]
```
When scraping from `https://example.com/scene/12345`, the `{inputHostname}` placeholder will be replaced with `example.com`.
These placeholders can also be used within selectors for more advanced use cases:
```yaml
scene:
  Details:
    selector: //div[@data-url="{inputURL}"]//p[@class="description"]
  Site:
    selector: //div[@data-host="{inputHostname}"]//span[@class="site-name"]
```
> **Note:** These placeholders represent the actual URL used to fetch the content, after any URL replacements have been applied.
### Common fragments
The `common` field configures selector fragments that can be referenced in selector strings. These are key-value pairs: the key is the token used to reference the fragment, and the value is the string that the fragment is replaced with. For example: