From 678b3de7c83f72a998ed1399ae340ad6c285adb6 Mon Sep 17 00:00:00 2001 From: Gykes <24581046+Gykes@users.noreply.github.com> Date: Sun, 9 Nov 2025 20:00:47 -0800 Subject: [PATCH] Feature: Support inputURL and inputHostname in scrapers (#6250) --- pkg/scraper/json.go | 22 +++++--- pkg/scraper/mapped.go | 26 +++++++++- pkg/scraper/xpath.go | 22 +++++--- .../src/docs/en/Manual/ScraperDevelopment.md | 50 ++++++++++++++++++- 4 files changed, 102 insertions(+), 18 deletions(-) diff --git a/pkg/scraper/json.go b/pkg/scraper/json.go index fc7eb17a2..9f479f1c2 100644 --- a/pkg/scraper/json.go +++ b/pkg/scraper/json.go @@ -80,7 +80,7 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, u) // if these just return the return values from scraper.scrape* functions then // it ends up returning ScrapedContent(nil) rather than nil switch ty { @@ -140,7 +140,7 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, url) q.setType(SearchQuery) var content []ScrapedContent @@ -192,7 +192,7 @@ func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scen return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, url) return scraper.scrapeScene(ctx, q) } @@ -227,7 +227,7 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, url) return scraper.scrapeScene(ctx, q) } @@ -251,7 +251,7 @@ func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Imag return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, url) return scraper.scrapeImage(ctx, q) } @@ -275,14 +275,15 @@ func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *model return nil, err } - q := s.getJsonQuery(doc) + q := s.getJsonQuery(doc, url) 
return scraper.scrapeGallery(ctx, q) } -func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery { +func (s *jsonScraper) getJsonQuery(doc string, url string) *jsonQuery { return &jsonQuery{ doc: doc, scraper: s, + url: url, } } @@ -290,6 +291,7 @@ type jsonQuery struct { doc string scraper *jsonScraper queryType QueryType + url string } func (q *jsonQuery) getType() QueryType { @@ -300,6 +302,10 @@ func (q *jsonQuery) setType(t QueryType) { q.queryType = t } +func (q *jsonQuery) getURL() string { + return q.url +} + func (q *jsonQuery) runQuery(selector string) ([]string, error) { value := gjson.Get(q.doc, selector) @@ -331,5 +337,5 @@ func (q *jsonQuery) subScrape(ctx context.Context, value string) mappedQuery { return nil } - return q.scraper.getJsonQuery(doc) + return q.scraper.getJsonQuery(doc, value) } diff --git a/pkg/scraper/mapped.go b/pkg/scraper/mapped.go index f89499176..3fac22ec3 100644 --- a/pkg/scraper/mapped.go +++ b/pkg/scraper/mapped.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "math" + "net/url" "reflect" "regexp" "strconv" @@ -24,6 +25,7 @@ type mappedQuery interface { getType() QueryType setType(QueryType) subScrape(ctx context.Context, value string) mappedQuery + getURL() string } type commonMappedConfig map[string]string @@ -43,6 +45,22 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string { return ret } +// extractHostname parses a URL string and returns the hostname. +// Returns empty string if the URL cannot be parsed. 
+func extractHostname(urlStr string) string { + if urlStr == "" { + return "" + } + + u, err := url.Parse(urlStr) + if err != nil { + logger.Warnf("Error parsing URL '%s': %s", urlStr, err.Error()) + return "" + } + + return u.Hostname() +} + type isMultiFunc func(key string) bool func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults { @@ -53,10 +71,16 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM if attrConfig.Fixed != "" { // TODO - not sure if this needs to set _all_ indexes for the key const i = 0 - ret = ret.setSingleValue(i, k, attrConfig.Fixed) + // Support {inputURL} and {inputHostname} placeholders in fixed values + value := strings.ReplaceAll(attrConfig.Fixed, "{inputURL}", q.getURL()) + value = strings.ReplaceAll(value, "{inputHostname}", extractHostname(q.getURL())) + ret = ret.setSingleValue(i, k, value) } else { selector := attrConfig.Selector selector = s.applyCommon(common, selector) + // Support {inputURL} and {inputHostname} placeholders in selectors + selector = strings.ReplaceAll(selector, "{inputURL}", q.getURL()) + selector = strings.ReplaceAll(selector, "{inputHostname}", extractHostname(q.getURL())) found, err := q.runQuery(selector) if err != nil { diff --git a/pkg/scraper/xpath.go b/pkg/scraper/xpath.go index 9993aa3ff..e042c861a 100644 --- a/pkg/scraper/xpath.go +++ b/pkg/scraper/xpath.go @@ -61,7 +61,7 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, u) // if these just return the return values from scraper.scrape* functions then // it ends up returning ScrapedContent(nil) rather than nil switch ty { @@ -121,7 +121,7 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, url) q.setType(SearchQuery) var content 
[]ScrapedContent @@ -171,7 +171,7 @@ func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sce return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, url) return scraper.scrapeScene(ctx, q) } @@ -206,7 +206,7 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, url) return scraper.scrapeScene(ctx, q) } @@ -230,7 +230,7 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, url) return scraper.scrapeGallery(ctx, q) } @@ -254,7 +254,7 @@ func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Ima return nil, err } - q := s.getXPathQuery(doc) + q := s.getXPathQuery(doc, url) return scraper.scrapeImage(ctx, q) } @@ -277,10 +277,11 @@ func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, err return ret, err } -func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery { +func (s *xpathScraper) getXPathQuery(doc *html.Node, url string) *xpathQuery { return &xpathQuery{ doc: doc, scraper: s, + url: url, } } @@ -288,6 +289,7 @@ type xpathQuery struct { doc *html.Node scraper *xpathScraper queryType QueryType + url string } func (q *xpathQuery) getType() QueryType { @@ -298,6 +300,10 @@ func (q *xpathQuery) setType(t QueryType) { q.queryType = t } +func (q *xpathQuery) getURL() string { + return q.url +} + func (q *xpathQuery) runQuery(selector string) ([]string, error) { found, err := htmlquery.QueryAll(q.doc, selector) if err != nil { @@ -346,5 +352,5 @@ func (q *xpathQuery) subScrape(ctx context.Context, value string) mappedQuery { return nil } - return q.scraper.getXPathQuery(doc) + return q.scraper.getXPathQuery(doc, value) } diff --git a/ui/v2.5/src/docs/en/Manual/ScraperDevelopment.md b/ui/v2.5/src/docs/en/Manual/ScraperDevelopment.md index 0ee1c0880..bd87d71ab 100644 --- 
a/ui/v2.5/src/docs/en/Manual/ScraperDevelopment.md +++ b/ui/v2.5/src/docs/en/Manual/ScraperDevelopment.md @@ -325,10 +325,58 @@ Alternatively, an attribute value may be set to a fixed value, rather than scrap ```yaml performer: - Gender: + Gender: fixed: Female ``` +### Input URL placeholders + +The `{inputURL}` and `{inputHostname}` placeholders can be used in both `fixed` values and `selector` expressions to access information about the URL from which the content was fetched. + +#### {inputURL} + +The `{inputURL}` placeholder provides access to the full URL. This is useful when you want to return or reference the source URL as part of the scraped data. + +For example: + +```yaml +scene: + URL: + fixed: "{inputURL}" + Title: + selector: //h1[@class="title"] +``` + +When scraping from `https://example.com/scene/12345`, the `{inputURL}` placeholder will be replaced with `https://example.com/scene/12345`. + +#### {inputHostname} + +The `{inputHostname}` placeholder extracts just the hostname from the URL (the port, if any, is not included). This is useful when you need to reference the domain without manually parsing the URL. + +For example: + +```yaml +scene: + Studio: + fixed: "{inputHostname}" + Details: + selector: //div[@data-domain="{inputHostname}"]//p[@class="description"] +``` + +When scraping from `https://example.com/scene/12345`, the `{inputHostname}` placeholder will be replaced with `example.com`. + +These placeholders can also be used within selectors for more advanced use cases: + +```yaml +scene: + Details: + selector: //div[@data-url="{inputURL}"]//p[@class="description"] + Site: + selector: //div[@data-host="{inputHostname}"]//span[@class="site-name"] +``` + +> **Note:** These placeholders represent the actual URL used to fetch the content, after any URL replacements have been applied. + ### Common fragments The `common` field is used to configure selector fragments that can be referenced in the selector strings. 
These are key-value pairs where the key is the string to reference the fragment, and the value is the string that the fragment will be replaced with. For example: