Refactor scraper package (#6495)

* Remove reflection from mapped value processing
* AI-generated unit tests
* Move mappedConfig to separate file
* Rename group to configScraper
* Separate mapped post-processing code into separate file
* Update test after group rename
* Check map entry when returning scraper
* Refactor config into definition
* Support single string for string slice translation
* Rename config.go to definition.go
* Rename configScraper to definedScraper
* Rename config_scraper.go to defined_scraper.go
This commit is contained in:
WithoutPants 2026-02-04 11:07:51 +11:00 committed by GitHub
parent ed0fb53ae0
commit 88eb46380c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 2475 additions and 1324 deletions

View file

@ -24,9 +24,85 @@ func (e scraperAction) IsValid() bool {
return false
}
type scraperActionImpl interface {
// urlScraperActionImpl is implemented by scraper actions that can scrape
// content of the given type from a URL.
type urlScraperActionImpl interface {
	scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error)
}
// getURLScraper returns the URL scraper implementation for the action
// declared in def. It panics if the action is not a recognised scraper
// action (invalid actions are expected to be rejected at validation time).
func (c Definition) getURLScraper(def ByURLDefinition, client *http.Client, globalConfig GlobalConfig) urlScraperActionImpl {
	switch def.Action {
	case scraperActionScript:
		base := scriptScraper{definition: c, globalConfig: globalConfig}
		return &scriptURLScraper{scriptScraper: base, definition: def}
	case scraperActionStash:
		return newStashScraper(client, c, globalConfig)
	case scraperActionXPath:
		base := xpathScraper{definition: c, globalConfig: globalConfig, client: client}
		return &xpathURLScraper{xpathScraper: base, definition: def}
	case scraperActionJson:
		base := jsonScraper{definition: c, globalConfig: globalConfig, client: client}
		return &jsonURLScraper{jsonScraper: base, definition: def}
	}

	panic("unknown scraper action: " + def.Action)
}
// nameScraperActionImpl is implemented by scraper actions that can search
// for content of the given type by name, returning zero or more results.
type nameScraperActionImpl interface {
	scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error)
}
// getNameScraper returns the name-query scraper implementation for the
// action declared in def. It panics if the action is not a recognised
// scraper action (invalid actions are expected to be rejected at
// validation time).
func (c Definition) getNameScraper(def ByNameDefinition, client *http.Client, globalConfig GlobalConfig) nameScraperActionImpl {
	switch def.Action {
	case scraperActionScript:
		base := scriptScraper{definition: c, globalConfig: globalConfig}
		return &scriptNameScraper{scriptScraper: base, definition: def}
	case scraperActionStash:
		return newStashScraper(client, c, globalConfig)
	case scraperActionXPath:
		base := xpathScraper{definition: c, globalConfig: globalConfig, client: client}
		return &xpathNameScraper{xpathScraper: base, definition: def}
	case scraperActionJson:
		base := jsonScraper{definition: c, globalConfig: globalConfig, client: client}
		return &jsonNameScraper{jsonScraper: base, definition: def}
	}

	panic("unknown scraper action: " + def.Action)
}
type fragmentScraperActionImpl interface {
scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error)
scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error)
@ -34,17 +110,37 @@ type scraperActionImpl interface {
scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error)
}
func (c config) getScraper(scraper scraperTypeConfig, client *http.Client, globalConfig GlobalConfig) scraperActionImpl {
switch scraper.Action {
func (c Definition) getFragmentScraper(actionDef ByFragmentDefinition, client *http.Client, globalConfig GlobalConfig) fragmentScraperActionImpl {
switch actionDef.Action {
case scraperActionScript:
return newScriptScraper(scraper, c, globalConfig)
return &scriptFragmentScraper{
scriptScraper: scriptScraper{
definition: c,
globalConfig: globalConfig,
},
definition: actionDef,
}
case scraperActionStash:
return newStashScraper(scraper, client, c, globalConfig)
return newStashScraper(client, c, globalConfig)
case scraperActionXPath:
return newXpathScraper(scraper, client, c, globalConfig)
return &xpathFragmentScraper{
xpathScraper: xpathScraper{
definition: c,
globalConfig: globalConfig,
client: client,
},
definition: actionDef,
}
case scraperActionJson:
return newJsonScraper(scraper, client, c, globalConfig)
return &jsonFragmentScraper{
jsonScraper: jsonScraper{
definition: c,
globalConfig: globalConfig,
client: client,
},
definition: actionDef,
}
}
panic("unknown scraper action: " + scraper.Action)
panic("unknown scraper action: " + actionDef.Action)
}

View file

@ -182,7 +182,7 @@ func (c *Cache) ReloadScrapers() {
if err != nil {
logger.Errorf("Error loading scraper %s: %v", fp, err)
} else {
scraper := newGroupScraper(*conf, c.globalConfig)
scraper := scraperFromDefinition(*conf, c.globalConfig)
scrapers[scraper.spec().ID] = scraper
}
}

View file

@ -18,7 +18,7 @@ import (
)
// jar constructs a cookie jar from a configuration
func (c config) jar() (*cookiejar.Jar, error) {
func (c Definition) jar() (*cookiejar.Jar, error) {
opts := c.DriverOptions
jar, err := cookiejar.New(&cookiejar.Options{
PublicSuffixList: publicsuffix.List,
@ -77,7 +77,7 @@ func randomSequence(n int) string {
}
// printCookies prints all cookies from the given cookie jar
func printCookies(jar *cookiejar.Jar, scraperConfig config, msg string) {
func printCookies(jar *cookiejar.Jar, scraperConfig Definition, msg string) {
driverOptions := scraperConfig.DriverOptions
if driverOptions != nil && !driverOptions.UseCDP {
var foundURLs []*url.URL

View file

@ -8,25 +8,26 @@ import (
"github.com/stashapp/stash/pkg/models"
)
type group struct {
config config
// definedScraper implements the scraper interface using a Definition object.
type definedScraper struct {
config Definition
globalConf GlobalConfig
}
func newGroupScraper(c config, globalConfig GlobalConfig) scraper {
return group{
func scraperFromDefinition(c Definition, globalConfig GlobalConfig) definedScraper {
return definedScraper{
config: c,
globalConf: globalConfig,
}
}
func (g group) spec() Scraper {
func (g definedScraper) spec() Scraper {
return g.config.spec()
}
// fragmentScraper finds an appropriate fragment scraper based on input.
func (g group) fragmentScraper(input Input) *scraperTypeConfig {
func (g definedScraper) fragmentScraper(input Input) *ByFragmentDefinition {
switch {
case input.Performer != nil:
return g.config.PerformerByFragment
@ -43,7 +44,7 @@ func (g group) fragmentScraper(input Input) *scraperTypeConfig {
return nil
}
func (g group) viaFragment(ctx context.Context, client *http.Client, input Input) (ScrapedContent, error) {
func (g definedScraper) viaFragment(ctx context.Context, client *http.Client, input Input) (ScrapedContent, error) {
stc := g.fragmentScraper(input)
if stc == nil {
// If there's no performer fragment scraper in the group, we try to use
@ -56,38 +57,38 @@ func (g group) viaFragment(ctx context.Context, client *http.Client, input Input
return nil, ErrNotSupported
}
s := g.config.getScraper(*stc, client, g.globalConf)
s := g.config.getFragmentScraper(*stc, client, g.globalConf)
return s.scrapeByFragment(ctx, input)
}
func (g group) viaScene(ctx context.Context, client *http.Client, scene *models.Scene) (*models.ScrapedScene, error) {
func (g definedScraper) viaScene(ctx context.Context, client *http.Client, scene *models.Scene) (*models.ScrapedScene, error) {
if g.config.SceneByFragment == nil {
return nil, ErrNotSupported
}
s := g.config.getScraper(*g.config.SceneByFragment, client, g.globalConf)
s := g.config.getFragmentScraper(*g.config.SceneByFragment, client, g.globalConf)
return s.scrapeSceneByScene(ctx, scene)
}
func (g group) viaGallery(ctx context.Context, client *http.Client, gallery *models.Gallery) (*models.ScrapedGallery, error) {
func (g definedScraper) viaGallery(ctx context.Context, client *http.Client, gallery *models.Gallery) (*models.ScrapedGallery, error) {
if g.config.GalleryByFragment == nil {
return nil, ErrNotSupported
}
s := g.config.getScraper(*g.config.GalleryByFragment, client, g.globalConf)
s := g.config.getFragmentScraper(*g.config.GalleryByFragment, client, g.globalConf)
return s.scrapeGalleryByGallery(ctx, gallery)
}
func (g group) viaImage(ctx context.Context, client *http.Client, gallery *models.Image) (*models.ScrapedImage, error) {
func (g definedScraper) viaImage(ctx context.Context, client *http.Client, gallery *models.Image) (*models.ScrapedImage, error) {
if g.config.ImageByFragment == nil {
return nil, ErrNotSupported
}
s := g.config.getScraper(*g.config.ImageByFragment, client, g.globalConf)
s := g.config.getFragmentScraper(*g.config.ImageByFragment, client, g.globalConf)
return s.scrapeImageByImage(ctx, gallery)
}
func loadUrlCandidates(c config, ty ScrapeContentType) []*scrapeByURLConfig {
func loadUrlCandidates(c Definition, ty ScrapeContentType) []*ByURLDefinition {
switch ty {
case ScrapeContentTypePerformer:
return c.PerformerByURL
@ -104,12 +105,13 @@ func loadUrlCandidates(c config, ty ScrapeContentType) []*scrapeByURLConfig {
panic("loadUrlCandidates: unreachable")
}
func (g group) viaURL(ctx context.Context, client *http.Client, url string, ty ScrapeContentType) (ScrapedContent, error) {
func (g definedScraper) viaURL(ctx context.Context, client *http.Client, url string, ty ScrapeContentType) (ScrapedContent, error) {
candidates := loadUrlCandidates(g.config, ty)
for _, scraper := range candidates {
if scraper.matchesURL(url) {
s := g.config.getScraper(scraper.scraperTypeConfig, client, g.globalConf)
ret, err := s.scrapeByURL(ctx, url, ty)
u := replaceURL(url, *scraper) // allow a URL Replace for url-queries
s := g.config.getURLScraper(*scraper, client, g.globalConf)
ret, err := s.scrapeByURL(ctx, u, ty)
if err != nil {
return nil, err
}
@ -123,31 +125,31 @@ func (g group) viaURL(ctx context.Context, client *http.Client, url string, ty S
return nil, nil
}
func (g group) viaName(ctx context.Context, client *http.Client, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
func (g definedScraper) viaName(ctx context.Context, client *http.Client, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
switch ty {
case ScrapeContentTypePerformer:
if g.config.PerformerByName == nil {
break
}
s := g.config.getScraper(*g.config.PerformerByName, client, g.globalConf)
s := g.config.getNameScraper(*g.config.PerformerByName, client, g.globalConf)
return s.scrapeByName(ctx, name, ty)
case ScrapeContentTypeScene:
if g.config.SceneByName == nil {
break
}
s := g.config.getScraper(*g.config.SceneByName, client, g.globalConf)
s := g.config.getNameScraper(*g.config.SceneByName, client, g.globalConf)
return s.scrapeByName(ctx, name, ty)
}
return nil, fmt.Errorf("%w: cannot load %v by name", ErrNotSupported, ty)
}
func (g group) supports(ty ScrapeContentType) bool {
func (g definedScraper) supports(ty ScrapeContentType) bool {
return g.config.supports(ty)
}
func (g group) supportsURL(url string, ty ScrapeContentType) bool {
func (g definedScraper) supportsURL(url string, ty ScrapeContentType) bool {
return g.config.matchesURL(url, ty)
}

View file

@ -11,7 +11,8 @@ import (
"gopkg.in/yaml.v2"
)
type config struct {
// Definition represents a scraper definition (typically) loaded from a YAML configuration file.
type Definition struct {
ID string
path string
@ -19,43 +20,43 @@ type config struct {
Name string `yaml:"name"`
// Configuration for querying performers by name
PerformerByName *scraperTypeConfig `yaml:"performerByName"`
PerformerByName *ByNameDefinition `yaml:"performerByName"`
// Configuration for querying performers by a Performer fragment
PerformerByFragment *scraperTypeConfig `yaml:"performerByFragment"`
PerformerByFragment *ByFragmentDefinition `yaml:"performerByFragment"`
// Configuration for querying a performer by a URL
PerformerByURL []*scrapeByURLConfig `yaml:"performerByURL"`
PerformerByURL []*ByURLDefinition `yaml:"performerByURL"`
// Configuration for querying scenes by a Scene fragment
SceneByFragment *scraperTypeConfig `yaml:"sceneByFragment"`
SceneByFragment *ByFragmentDefinition `yaml:"sceneByFragment"`
// Configuration for querying gallery by a Gallery fragment
GalleryByFragment *scraperTypeConfig `yaml:"galleryByFragment"`
GalleryByFragment *ByFragmentDefinition `yaml:"galleryByFragment"`
// Configuration for querying scenes by name
SceneByName *scraperTypeConfig `yaml:"sceneByName"`
SceneByName *ByNameDefinition `yaml:"sceneByName"`
// Configuration for querying scenes by query fragment
SceneByQueryFragment *scraperTypeConfig `yaml:"sceneByQueryFragment"`
SceneByQueryFragment *ByFragmentDefinition `yaml:"sceneByQueryFragment"`
// Configuration for querying a scene by a URL
SceneByURL []*scrapeByURLConfig `yaml:"sceneByURL"`
SceneByURL []*ByURLDefinition `yaml:"sceneByURL"`
// Configuration for querying a gallery by a URL
GalleryByURL []*scrapeByURLConfig `yaml:"galleryByURL"`
GalleryByURL []*ByURLDefinition `yaml:"galleryByURL"`
// Configuration for querying an image by a URL
ImageByURL []*scrapeByURLConfig `yaml:"imageByURL"`
ImageByURL []*ByURLDefinition `yaml:"imageByURL"`
// Configuration for querying image by an Image fragment
ImageByFragment *scraperTypeConfig `yaml:"imageByFragment"`
ImageByFragment *ByFragmentDefinition `yaml:"imageByFragment"`
// Configuration for querying a movie by a URL - deprecated, use GroupByURL
MovieByURL []*scrapeByURLConfig `yaml:"movieByURL"`
MovieByURL []*ByURLDefinition `yaml:"movieByURL"`
// Configuration for querying a group by a URL
GroupByURL []*scrapeByURLConfig `yaml:"groupByURL"`
GroupByURL []*ByURLDefinition `yaml:"groupByURL"`
// Scraper debugging options
DebugOptions *scraperDebugOptions `yaml:"debug"`
@ -73,7 +74,7 @@ type config struct {
DriverOptions *scraperDriverOptions `yaml:"driver"`
}
func (c config) validate() error {
func (c Definition) validate() error {
if strings.TrimSpace(c.Name) == "" {
return errors.New("name must not be empty")
}
@ -126,17 +127,13 @@ type stashServer struct {
ApiKey string `yaml:"apiKey"`
}
type scraperTypeConfig struct {
type ActionDefinition struct {
Action scraperAction `yaml:"action"`
Script []string `yaml:"script,flow"`
Scraper string `yaml:"scraper"`
// for xpath name scraper only
QueryURL string `yaml:"queryURL"`
QueryURLReplacements queryURLReplacements `yaml:"queryURLReplace"`
}
func (c scraperTypeConfig) validate() error {
func (c ActionDefinition) validate() error {
if !c.Action.IsValid() {
return fmt.Errorf("%s is not a valid scraper action", c.Action)
}
@ -148,20 +145,22 @@ func (c scraperTypeConfig) validate() error {
return nil
}
type scrapeByURLConfig struct {
scraperTypeConfig `yaml:",inline"`
URL []string `yaml:"url,flow"`
type ByURLDefinition struct {
ActionDefinition `yaml:",inline"`
URL []string `yaml:"url,flow"`
QueryURL string `yaml:"queryURL"`
QueryURLReplacements queryURLReplacements `yaml:"queryURLReplace"`
}
func (c scrapeByURLConfig) validate() error {
func (c ByURLDefinition) validate() error {
if len(c.URL) == 0 {
return errors.New("url is mandatory for scrape by url scrapers")
}
return c.scraperTypeConfig.validate()
return c.ActionDefinition.validate()
}
func (c scrapeByURLConfig) matchesURL(url string) bool {
func (c ByURLDefinition) matchesURL(url string) bool {
for _, thisURL := range c.URL {
if strings.Contains(url, thisURL) {
return true
@ -171,6 +170,18 @@ func (c scrapeByURLConfig) matchesURL(url string) bool {
return false
}
// ByFragmentDefinition configures a scraper action that is invoked with a
// content fragment (performer/scene/gallery/image). QueryURL and
// QueryURLReplacements optionally build the URL to scrape from the
// fragment's fields.
type ByFragmentDefinition struct {
	ActionDefinition     `yaml:",inline"`
	QueryURL             string               `yaml:"queryURL"`
	QueryURLReplacements queryURLReplacements `yaml:"queryURLReplace"`
}

// ByNameDefinition configures a scraper action that is invoked with a name
// query. QueryURL is the URL template used to perform the search.
type ByNameDefinition struct {
	ActionDefinition `yaml:",inline"`
	QueryURL         string `yaml:"queryURL"`
}
// scraperDebugOptions holds debugging flags for a scraper definition.
type scraperDebugOptions struct {
	// PrintHTML causes fetched documents to be logged after loading.
	PrintHTML bool `yaml:"printHTML"`
}
@ -206,8 +217,8 @@ type scraperDriverOptions struct {
Headers []*header `yaml:"headers"`
}
func loadConfigFromYAML(id string, reader io.Reader) (*config, error) {
ret := &config{}
func loadConfigFromYAML(id string, reader io.Reader) (*Definition, error) {
ret := &Definition{}
parser := yaml.NewDecoder(reader)
parser.SetStrict(true)
@ -225,7 +236,7 @@ func loadConfigFromYAML(id string, reader io.Reader) (*config, error) {
return ret, nil
}
func loadConfigFromYAMLFile(path string) (*config, error) {
func loadConfigFromYAMLFile(path string) (*Definition, error) {
file, err := os.Open(path)
if err != nil {
return nil, err
@ -246,7 +257,7 @@ func loadConfigFromYAMLFile(path string) (*config, error) {
return ret, nil
}
func (c config) spec() Scraper {
func (c Definition) spec() Scraper {
ret := Scraper{
ID: c.ID,
Name: c.Name,
@ -334,7 +345,7 @@ func (c config) spec() Scraper {
return ret
}
func (c config) supports(ty ScrapeContentType) bool {
func (c Definition) supports(ty ScrapeContentType) bool {
switch ty {
case ScrapeContentTypePerformer:
return c.PerformerByName != nil || c.PerformerByFragment != nil || len(c.PerformerByURL) > 0
@ -351,7 +362,7 @@ func (c config) supports(ty ScrapeContentType) bool {
panic("Unhandled ScrapeContentType")
}
func (c config) matchesURL(url string, ty ScrapeContentType) bool {
func (c Definition) matchesURL(url string, ty ScrapeContentType) bool {
switch ty {
case ScrapeContentTypePerformer:
for _, scraper := range c.PerformerByURL {

View file

@ -139,5 +139,5 @@ func getFreeonesScraper(globalConfig GlobalConfig) scraper {
logger.Fatalf("Error loading builtin freeones scraper: %s", err.Error())
}
return newGroupScraper(*c, globalConfig)
return scraperFromDefinition(*c, globalConfig)
}

View file

@ -15,43 +15,22 @@ import (
)
type jsonScraper struct {
scraper scraperTypeConfig
config config
definition Definition
globalConfig GlobalConfig
client *http.Client
}
func newJsonScraper(scraper scraperTypeConfig, client *http.Client, config config, globalConfig GlobalConfig) *jsonScraper {
return &jsonScraper{
scraper: scraper,
config: config,
client: client,
globalConfig: globalConfig,
}
}
func (s *jsonScraper) getJsonScraper() *mappedScraper {
return s.config.JsonScrapers[s.scraper.Scraper]
}
func (s *jsonScraper) scrapeURL(ctx context.Context, url string) (string, *mappedScraper, error) {
scraper := s.getJsonScraper()
if scraper == nil {
return "", nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
func (s *jsonScraper) getJsonScraper(name string) (*mappedScraper, error) {
ret, ok := s.definition.JsonScrapers[name]
if !ok {
return nil, fmt.Errorf("json scraper with name %s not found in config", name)
}
doc, err := s.loadURL(ctx, url)
if err != nil {
return "", nil, err
}
return doc, scraper, nil
return &ret, nil
}
func (s *jsonScraper) loadURL(ctx context.Context, url string) (string, error) {
r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig)
r, err := loadURL(ctx, url, s.client, s.definition, s.globalConfig)
if err != nil {
return "", err
}
@ -66,21 +45,30 @@ func (s *jsonScraper) loadURL(ctx context.Context, url string) (string, error) {
return "", errors.New("not valid json")
}
if s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
if s.definition.DebugOptions != nil && s.definition.DebugOptions.PrintHTML {
logger.Infof("loadURL (%s) response: \n%s", url, docStr)
}
return docStr, err
}
func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
u := replaceURL(url, s.scraper) // allow a URL Replace for url-queries
doc, scraper, err := s.scrapeURL(ctx, u)
// jsonURLScraper scrapes content from a URL using a ByURLDefinition,
// building on the shared jsonScraper base.
type jsonURLScraper struct {
	jsonScraper
	definition ByURLDefinition
}
func (s *jsonURLScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
q := s.getJsonQuery(doc, u)
doc, err := s.loadURL(ctx, url)
if err != nil {
return nil, err
}
q := s.getJsonQuery(doc, url)
// if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil
switch ty {
@ -119,11 +107,15 @@ func (s *jsonScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCont
return nil, ErrNotSupported
}
func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
scraper := s.getJsonScraper()
// jsonNameScraper scrapes content by name query using a ByNameDefinition,
// building on the shared jsonScraper base.
type jsonNameScraper struct {
	jsonScraper
	definition ByNameDefinition
}
if scraper == nil {
return nil, fmt.Errorf("%w: name %v", ErrNotFound, s.scraper.Scraper)
func (s *jsonNameScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
const placeholder = "{}"
@ -131,7 +123,7 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo
// replace the placeholder string with the URL-escaped name
escapedName := url.QueryEscape(name)
url := s.scraper.QueryURL
url := s.definition.QueryURL
url = strings.ReplaceAll(url, placeholder, escapedName)
doc, err := s.loadURL(ctx, url)
@ -172,18 +164,22 @@ func (s *jsonScraper) scrapeByName(ctx context.Context, name string, ty ScrapeCo
return nil, ErrNotSupported
}
func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
// jsonFragmentScraper scrapes content from a content fragment using a
// ByFragmentDefinition, building on the shared jsonScraper base.
type jsonFragmentScraper struct {
	jsonScraper
	definition ByFragmentDefinition
}
func (s *jsonFragmentScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
// construct the URL
queryURL := queryURLParametersFromScene(scene)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getJsonScraper()
if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -196,7 +192,7 @@ func (s *jsonScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scen
return scraper.scrapeScene(ctx, q)
}
func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
func (s *jsonFragmentScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
switch {
case input.Gallery != nil:
return nil, fmt.Errorf("%w: cannot use a json scraper as a gallery fragment scraper", ErrNotSupported)
@ -210,15 +206,14 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
// construct the URL
queryURL := queryURLParametersFromScrapedScene(scene)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getJsonScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -231,18 +226,17 @@ func (s *jsonScraper) scrapeByFragment(ctx context.Context, input Input) (Scrape
return scraper.scrapeScene(ctx, q)
}
func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
func (s *jsonFragmentScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
// construct the URL
queryURL := queryURLParametersFromImage(image)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getJsonScraper()
if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -255,18 +249,17 @@ func (s *jsonScraper) scrapeImageByImage(ctx context.Context, image *models.Imag
return scraper.scrapeImage(ctx, q)
}
func (s *jsonScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
func (s *jsonFragmentScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
// construct the URL
queryURL := queryURLParametersFromGallery(gallery)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getJsonScraper()
if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getJsonScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)

View file

@ -68,7 +68,7 @@ jsonScrapers:
}
`
c := &config{}
c := &Definition{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err != nil {

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,537 @@
package scraper
import (
"context"
"errors"
"net/url"
"strings"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/sliceutil"
"gopkg.in/yaml.v2"
)
// commonMappedConfig maps placeholder tokens to replacement strings shared
// across the selectors of a mapped scraper (the "common" section).
type commonMappedConfig map[string]string

// mappedConfig maps result keys to the attribute configuration used to
// extract and post-process their values.
type mappedConfig map[string]mappedScraperAttrConfig
// applyCommon substitutes every common placeholder token found in src with
// its configured replacement and returns the result. A nil common map
// leaves src unchanged.
func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
	result := src
	// ranging over a nil map is a no-op, so no explicit nil check is needed
	for token, replacement := range c {
		result = strings.ReplaceAll(result, token, replacement)
	}
	return result
}
// extractHostname parses a URL string and returns the hostname.
// Returns empty string if the input is empty or the URL cannot be parsed
// (a parse failure is logged as a warning).
func extractHostname(urlStr string) string {
	if urlStr == "" {
		return ""
	}

	parsed, err := url.Parse(urlStr)
	if err == nil {
		return parsed.Hostname()
	}

	logger.Warnf("Error parsing URL '%s': %s", urlStr, err.Error())
	return ""
}
// isMultiFunc reports whether a result key should be stored as a single
// multi-valued entry (e.g. URLs) rather than one value per result index.
type isMultiFunc func(key string) bool

// process evaluates every attribute in the config against the query q and
// collects extracted values into a mappedResults list. Fixed attributes are
// stored at index 0; selector attributes are run through q.runQuery and
// post-processed. Both fixed values and selectors support the {inputURL}
// and {inputHostname} placeholders, expanded from q.getURL().
// Map iteration order is random, so attribute processing order is
// unspecified.
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
	var ret mappedResults

	for k, attrConfig := range s {

		if attrConfig.Fixed != "" {
			// TODO - not sure if this needs to set _all_ indexes for the key
			const i = 0
			// Support {inputURL} and {inputHostname} placeholders in fixed values
			value := strings.ReplaceAll(attrConfig.Fixed, "{inputURL}", q.getURL())
			value = strings.ReplaceAll(value, "{inputHostname}", extractHostname(q.getURL()))
			ret = ret.setSingleValue(i, k, value)
		} else {
			selector := attrConfig.Selector
			selector = s.applyCommon(common, selector)
			// Support {inputURL} and {inputHostname} placeholders in selectors
			selector = strings.ReplaceAll(selector, "{inputURL}", q.getURL())
			selector = strings.ReplaceAll(selector, "{inputHostname}", extractHostname(q.getURL()))

			found, err := q.runQuery(selector)
			// a failed query is logged and treated as "no values" for this key
			if err != nil {
				logger.Warnf("key '%v': %v", k, err)
			}

			if len(found) > 0 {
				result := s.postProcess(ctx, q, attrConfig, found)

				// HACK - if the key is URLs, then we need to set the value as a multi-value
				// (this shadows the isMulti function parameter with a bool for the rest
				// of this iteration)
				isMulti := isMulti != nil && isMulti(k)
				if isMulti {
					ret = ret.setMultiValue(0, k, result)
				} else {
					for i, text := range result {
						ret = ret.setSingleValue(i, k, text)
					}
				}
			}
		}
	}

	return ret
}
// postProcess applies the attribute's post-processing pipeline to the raw
// query results. When concat is configured, all results are joined first,
// post-processed as one string, and optionally split back apart. Otherwise
// each result is post-processed individually (an individual split returns
// immediately with that element's parts). Cleaning is skipped for search
// queries so that search result lists are returned verbatim.
//
// NOTE(review): in the non-concat branch, a split returns its parts without
// cleanResults being applied, whereas the concat branch cleans after
// splitting - presumably intentional, but worth confirming.
func (s mappedConfig) postProcess(ctx context.Context, q mappedQuery, attrConfig mappedScraperAttrConfig, found []string) []string {
	// check if we're concatenating the results into a single result
	var ret []string
	if attrConfig.hasConcat() {
		result := attrConfig.concatenateResults(found)
		result = attrConfig.postProcess(ctx, result, q)
		if attrConfig.hasSplit() {
			results := attrConfig.splitString(result)
			// skip cleaning when the query is used for searching
			if q.getType() == SearchQuery {
				return results
			}
			results = attrConfig.cleanResults(results)
			return results
		}

		ret = []string{result}
	} else {
		for _, text := range found {
			text = attrConfig.postProcess(ctx, text, q)
			if attrConfig.hasSplit() {
				return attrConfig.splitString(text)
			}

			ret = append(ret, text)
		}

		// skip cleaning when the query is used for searching
		if q.getType() == SearchQuery {
			return ret
		}
		ret = attrConfig.cleanResults(ret)
	}

	return ret
}
// mappedSceneScraperConfig holds the mapped attribute configuration for
// scenes. The embedded mappedConfig carries the scalar scene fields; the
// named fields carry the known object sub-scrapers, which are stripped out
// of the YAML before the remainder is decoded into the base map (see
// UnmarshalYAML).
type mappedSceneScraperConfig struct {
	mappedConfig

	Tags       mappedConfig                 `yaml:"Tags"`
	Performers mappedPerformerScraperConfig `yaml:"Performers"`
	Studio     mappedConfig                 `yaml:"Studio"`
	Movies     mappedConfig                 `yaml:"Movies"`
	Groups     mappedConfig                 `yaml:"Groups"`
}

// _mappedSceneScraperConfig is an alias type used inside UnmarshalYAML to
// decode the named fields without recursing into the custom unmarshaler.
type _mappedSceneScraperConfig mappedSceneScraperConfig

// YAML keys of the known scene sub-scraper fields.
const (
	mappedScraperConfigSceneTags       = "Tags"
	mappedScraperConfigScenePerformers = "Performers"
	mappedScraperConfigSceneStudio     = "Studio"
	mappedScraperConfigSceneMovies     = "Movies"
	mappedScraperConfigSceneGroups     = "Groups"
)
// UnmarshalYAML implements yaml.Unmarshaler. Because the scalar scene
// fields share the mapping with the known object sub-fields (Tags,
// Performers, Studio, Movies, Groups), the input is decoded in two passes:
// the known sub-fields are separated out and decoded into the named struct
// fields, and everything remaining is decoded into the embedded
// mappedConfig.
func (s *mappedSceneScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// HACK - unmarshal to map first, then remove known scene sub-fields, then
	// remarshal to yaml and pass that down to the base map
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
	thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
	thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]
	thisMap[mappedScraperConfigSceneMovies] = parentMap[mappedScraperConfigSceneMovies]
	thisMap[mappedScraperConfigSceneGroups] = parentMap[mappedScraperConfigSceneGroups]

	delete(parentMap, mappedScraperConfigSceneTags)
	delete(parentMap, mappedScraperConfigScenePerformers)
	delete(parentMap, mappedScraperConfigSceneStudio)
	delete(parentMap, mappedScraperConfigSceneMovies)
	delete(parentMap, mappedScraperConfigSceneGroups)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedSceneScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedSceneScraperConfig(c)

	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedGalleryScraperConfig holds the mapped attribute configuration for
// galleries: scalar fields in the embedded mappedConfig plus the known
// object sub-scrapers (see UnmarshalYAML).
type mappedGalleryScraperConfig struct {
	mappedConfig

	Tags       mappedConfig `yaml:"Tags"`
	Performers mappedConfig `yaml:"Performers"`
	Studio     mappedConfig `yaml:"Studio"`
}

// _mappedGalleryScraperConfig is an alias type used inside UnmarshalYAML to
// decode the named fields without recursing into the custom unmarshaler.
type _mappedGalleryScraperConfig mappedGalleryScraperConfig
// UnmarshalYAML implements yaml.Unmarshaler using the same two-pass scheme
// as the scene config: the known sub-fields (Tags/Performers/Studio - the
// scene key constants are reused since the YAML keys are identical) are
// decoded into the named fields, and the remainder into the embedded
// mappedConfig.
func (s *mappedGalleryScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// HACK - unmarshal to map first, then remove known scene sub-fields, then
	// remarshal to yaml and pass that down to the base map
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
	thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
	thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]

	delete(parentMap, mappedScraperConfigSceneTags)
	delete(parentMap, mappedScraperConfigScenePerformers)
	delete(parentMap, mappedScraperConfigSceneStudio)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedGalleryScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedGalleryScraperConfig(c)

	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedImageScraperConfig holds the mapped attribute configuration for
// images: scalar fields in the embedded mappedConfig plus the known object
// sub-scrapers (see UnmarshalYAML).
type mappedImageScraperConfig struct {
	mappedConfig

	Tags       mappedConfig `yaml:"Tags"`
	Performers mappedConfig `yaml:"Performers"`
	Studio     mappedConfig `yaml:"Studio"`
}

// _mappedImageScraperConfig is an alias type used inside UnmarshalYAML to
// decode the named fields without recursing into the custom unmarshaler.
type _mappedImageScraperConfig mappedImageScraperConfig
// UnmarshalYAML implements yaml.Unmarshaler.
//
// HACK - unmarshal to map first, then remove the known sub-fields
// (reusing the scene sub-field key constants, which share the same key
// names), then remarshal to yaml and pass that down to the base map.
func (s *mappedImageScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigSceneTags] = parentMap[mappedScraperConfigSceneTags]
	thisMap[mappedScraperConfigScenePerformers] = parentMap[mappedScraperConfigScenePerformers]
	thisMap[mappedScraperConfigSceneStudio] = parentMap[mappedScraperConfigSceneStudio]

	delete(parentMap, mappedScraperConfigSceneTags)
	delete(parentMap, mappedScraperConfigScenePerformers)
	delete(parentMap, mappedScraperConfigSceneStudio)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedImageScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedImageScraperConfig(c)

	// everything that remains belongs to the base mappedConfig
	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedPerformerScraperConfig holds the mapped fields for a performer
// scrape, with the Tags sub-config broken out of the base map.
type mappedPerformerScraperConfig struct {
	mappedConfig

	Tags mappedConfig `yaml:"Tags"`
}

// _mappedPerformerScraperConfig is an alias used during unmarshalling to
// avoid infinite recursion into UnmarshalYAML.
type _mappedPerformerScraperConfig mappedPerformerScraperConfig

// yaml keys of the performer sub-fields handled separately from the base map.
const (
	mappedScraperConfigPerformerTags = "Tags"
)
// UnmarshalYAML implements yaml.Unmarshaler.
//
// HACK - unmarshal to map first, then remove the known performer sub-fields,
// then remarshal to yaml and pass that down to the base map.
func (s *mappedPerformerScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigPerformerTags] = parentMap[mappedScraperConfigPerformerTags]
	delete(parentMap, mappedScraperConfigPerformerTags)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedPerformerScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedPerformerScraperConfig(c)

	// everything that remains belongs to the base mappedConfig
	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedMovieScraperConfig holds the mapped fields for a movie scrape, with
// the Studio/Tags sub-configs broken out of the base map.
type mappedMovieScraperConfig struct {
	mappedConfig

	Studio mappedConfig `yaml:"Studio"`
	Tags   mappedConfig `yaml:"Tags"`
}

// _mappedMovieScraperConfig is an alias used during unmarshalling to avoid
// infinite recursion into UnmarshalYAML.
type _mappedMovieScraperConfig mappedMovieScraperConfig

// yaml keys of the movie sub-fields handled separately from the base map.
const (
	mappedScraperConfigMovieStudio = "Studio"
	mappedScraperConfigMovieTags   = "Tags"
)
// UnmarshalYAML implements yaml.Unmarshaler.
//
// HACK - unmarshal to map first, then remove known movie sub-fields, then
// remarshal to yaml and pass that down to the base map.
func (s *mappedMovieScraperConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	parentMap := make(map[string]interface{})
	if err := unmarshal(parentMap); err != nil {
		return err
	}

	// move the known sub-fields to a separate map
	thisMap := make(map[string]interface{})

	thisMap[mappedScraperConfigMovieStudio] = parentMap[mappedScraperConfigMovieStudio]
	delete(parentMap, mappedScraperConfigMovieStudio)

	thisMap[mappedScraperConfigMovieTags] = parentMap[mappedScraperConfigMovieTags]
	delete(parentMap, mappedScraperConfigMovieTags)

	// re-unmarshal the sub-fields
	yml, err := yaml.Marshal(thisMap)
	if err != nil {
		return err
	}

	// needs to be a different type to prevent infinite recursion
	c := _mappedMovieScraperConfig{}
	if err := yaml.Unmarshal(yml, &c); err != nil {
		return err
	}

	*s = mappedMovieScraperConfig(c)

	// everything that remains belongs to the base mappedConfig
	yml, err = yaml.Marshal(parentMap)
	if err != nil {
		return err
	}

	if err := yaml.Unmarshal(yml, &s.mappedConfig); err != nil {
		return err
	}

	return nil
}
// mappedScraperAttrConfig configures how a single attribute is scraped:
// either a fixed value or a selector, plus optional concat/split handling
// and post-process actions.
type mappedScraperAttrConfig struct {
	Selector    string                    `yaml:"selector"`
	Fixed       string                    `yaml:"fixed"`
	PostProcess []mappedPostProcessAction `yaml:"postProcess"`
	Concat      string                    `yaml:"concat"`
	Split       string                    `yaml:"split"`

	// postProcessActions is the converted, executable form of PostProcess
	// (or of the deprecated fields below); populated during unmarshalling.
	postProcessActions []postProcessAction

	// Deprecated: use PostProcess instead
	ParseDate string `yaml:"parseDate"`
	// Deprecated: use PostProcess instead
	Replace mappedRegexConfigs `yaml:"replace"`
	// Deprecated: use PostProcess instead
	SubScraper *mappedScraperAttrConfig `yaml:"subScraper"`
}

// _mappedScraperAttrConfig is an alias used during unmarshalling to avoid
// infinite recursion into UnmarshalYAML.
type _mappedScraperAttrConfig mappedScraperAttrConfig
// UnmarshalYAML allows an attribute config to be declared either as a plain
// string (shorthand for just a selector) or as a full object. In both cases
// the post-process actions are converted afterwards.
func (c *mappedScraperAttrConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// try unmarshalling into a string first
	if err := unmarshal(&c.Selector); err != nil {
		// if it's a type error then we try to unmarshall to the full object
		var typeErr *yaml.TypeError
		if !errors.As(err, &typeErr) {
			return err
		}

		// unmarshall to full object
		// need it as a separate object (to avoid recursing into this method)
		t := _mappedScraperAttrConfig{}
		if err = unmarshal(&t); err != nil {
			return err
		}

		*c = mappedScraperAttrConfig(t)
	}

	return c.convertPostProcessActions()
}
// convertPostProcessActions populates postProcessActions from either the new
// PostProcess list or the deprecated parseDate/replace/subScraper fields.
// Mixing the two forms is an error. The source fields are cleared once
// converted so they are not applied twice.
func (c *mappedScraperAttrConfig) convertPostProcessActions() error {
	// ensure we don't have the old deprecated fields and the new post process field
	if len(c.PostProcess) > 0 {
		if c.ParseDate != "" || len(c.Replace) > 0 || c.SubScraper != nil {
			return errors.New("cannot include postProcess and (parseDate, replace, subScraper) deprecated fields")
		}

		// convert xpathPostProcessAction actions to postProcessActions
		for _, a := range c.PostProcess {
			action, err := a.ToPostProcessAction()
			if err != nil {
				return err
			}
			c.postProcessActions = append(c.postProcessActions, action)
		}

		c.PostProcess = nil
	} else {
		// convert old deprecated fields if present
		// in same order as they used to be executed
		if len(c.Replace) > 0 {
			action := postProcessReplace(c.Replace)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.Replace = nil
		}

		if c.SubScraper != nil {
			action := postProcessSubScraper(*c.SubScraper)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.SubScraper = nil
		}

		if c.ParseDate != "" {
			action := postProcessParseDate(c.ParseDate)
			c.postProcessActions = append(c.postProcessActions, &action)
			c.ParseDate = ""
		}
	}

	return nil
}
// hasConcat reports whether a concat separator is configured.
func (c mappedScraperAttrConfig) hasConcat() bool {
	return len(c.Concat) > 0
}

// hasSplit reports whether a split separator is configured.
func (c mappedScraperAttrConfig) hasSplit() bool {
	return len(c.Split) > 0
}

// concatenateResults joins the scraped values with the configured separator.
func (c mappedScraperAttrConfig) concatenateResults(nodes []string) string {
	return strings.Join(nodes, c.Concat)
}

// cleanResults removes duplicate and empty values from the scraped results.
func (c mappedScraperAttrConfig) cleanResults(nodes []string) []string {
	deduped := sliceutil.Unique(nodes)
	return sliceutil.Delete(deduped, "")
}

// splitString splits value on the configured separator, dropping empty
// segments. With no separator configured the value is returned as a
// single-element slice.
func (c mappedScraperAttrConfig) splitString(value string) []string {
	if c.Split == "" {
		return []string{value}
	}

	var out []string
	for _, segment := range strings.Split(value, c.Split) {
		if segment == "" {
			continue
		}
		out = append(out, segment)
	}
	return out
}

// postProcess runs each configured post-process action over value in order.
func (c mappedScraperAttrConfig) postProcess(ctx context.Context, value string, q mappedQuery) string {
	result := value
	for _, action := range c.postProcessActions {
		result = action.Apply(ctx, result, q)
	}
	return result
}

View file

@ -0,0 +1,333 @@
package scraper
import (
"context"
"errors"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"time"
"github.com/stashapp/stash/pkg/javascript"
"github.com/stashapp/stash/pkg/logger"
)
// mappedRegexConfig is a single regex find/replace declared in scraper yaml.
type mappedRegexConfig struct {
	Regex string `yaml:"regex"`
	With  string `yaml:"with"`
}

// mappedRegexConfigs is an ordered list of regex replacements.
type mappedRegexConfigs []mappedRegexConfig
// apply runs the regex replacement against value. A bad or empty pattern
// leaves the value unchanged.
func (c mappedRegexConfig) apply(value string) string {
	if c.Regex == "" {
		return value
	}

	re, err := regexp.Compile(c.Regex)
	if err != nil {
		logger.Warnf("Error compiling regex '%s': %s", c.Regex, err.Error())
		return value
	}

	// trim leading and trailing whitespace
	// this is done to maintain backwards compatibility with existing
	// scrapers
	ret := strings.TrimSpace(re.ReplaceAllString(value, c.With))

	logger.Debugf(`Replace: '%s' with '%s'`, c.Regex, c.With)
	logger.Debugf("Before: %s", value)
	logger.Debugf("After: %s", ret)

	return ret
}
// apply runs each replacement over value, in declaration order.
func (c mappedRegexConfigs) apply(value string) string {
	result := value
	for _, cfg := range c {
		result = cfg.apply(result)
	}
	return result
}
// postProcessAction applies a transformation to a scraped value. The
// mappedQuery allows actions to perform further (sub-)scrapes.
type postProcessAction interface {
	Apply(ctx context.Context, value string, q mappedQuery) string
}
// postProcessParseDate parses the scraped value using the configured date
// layout (or "unix" for a unix timestamp) and reformats it into the internal
// YYYY-MM-DD format. The literal values "today"/"yesterday" are resolved
// relative to the current time before the layout is consulted. On any parse
// failure the original value is returned unchanged.
type postProcessParseDate string

func (p *postProcessParseDate) Apply(ctx context.Context, value string, q mappedQuery) string {
	parseDate := string(*p)

	const internalDateFormat = "2006-01-02"

	valueLower := strings.ToLower(value)
	if valueLower == "today" || valueLower == "yesterday" { // handle today, yesterday
		dt := time.Now()
		if valueLower == "yesterday" { // subtract 1 day from now
			dt = dt.AddDate(0, 0, -1)
		}
		return dt.Format(internalDateFormat)
	}

	if parseDate == "" {
		return value
	}

	if parseDate == "unix" {
		// try to parse the date using unix timestamp format
		// if it fails, then just fall back to the original value
		timeAsInt, err := strconv.ParseInt(value, 10, 64)
		if err != nil {
			logger.Warnf("Error parsing date string '%s' using unix timestamp format : %s", value, err.Error())
			return value
		}
		parsedValue := time.Unix(timeAsInt, 0)

		return parsedValue.Format(internalDateFormat)
	}

	// try to parse the date using the pattern
	// if it fails, then just fall back to the original value
	parsedValue, err := time.Parse(parseDate, value)
	if err != nil {
		logger.Warnf("Error parsing date string '%s' using format '%s': %s", value, parseDate, err.Error())
		return value
	}

	// convert it into our date format
	return parsedValue.Format(internalDateFormat)
}
// postProcessSubtractDays interprets the value as a number of days and
// returns the date that many days before now, in the internal date format.
type postProcessSubtractDays bool

func (p *postProcessSubtractDays) Apply(ctx context.Context, value string, q mappedQuery) string {
	const internalDateFormat = "2006-01-02"

	days, err := strconv.Atoi(value)
	if err != nil {
		logger.Warnf("Error parsing day string %s: %s", value, err)
		return value
	}

	return time.Now().AddDate(0, 0, -days).Format(internalDateFormat)
}
// postProcessReplace applies an ordered list of regex replacements to the
// scraped value.
type postProcessReplace mappedRegexConfigs

func (c *postProcessReplace) Apply(ctx context.Context, value string, q mappedQuery) string {
	return mappedRegexConfigs(*c).apply(value)
}
// postProcessSubScraper treats the scraped value as something to scrape
// again (via q.subScrape) with a nested attribute config, returning the
// sub-scraped result, or "" when nothing is found.
type postProcessSubScraper mappedScraperAttrConfig

func (p *postProcessSubScraper) Apply(ctx context.Context, value string, q mappedQuery) string {
	subScrapeConfig := mappedScraperAttrConfig(*p)

	logger.Debugf("Sub-scraping for: %s", value)
	ss := q.subScrape(ctx, value)

	if ss != nil {
		found, err := ss.runQuery(subScrapeConfig.Selector)
		if err != nil {
			// query errors are logged only; processing continues with
			// whatever (if anything) was found
			logger.Warnf("subscrape for '%v': %v", value, err)
		}

		if len(found) > 0 {
			// check if we're concatenating the results into a single result
			var result string
			if subScrapeConfig.hasConcat() {
				result = subScrapeConfig.concatenateResults(found)
			} else {
				result = found[0]
			}

			result = subScrapeConfig.postProcess(ctx, result, ss)
			return result
		}
	}

	return ""
}
// postProcessMap translates specific scraped values via a lookup table;
// values without an entry pass through unchanged.
type postProcessMap map[string]string

func (p *postProcessMap) Apply(ctx context.Context, value string, q mappedQuery) string {
	// return the mapped value if present
	if mapped, ok := (*p)[value]; ok {
		return mapped
	}
	return value
}
// feetToCmDigitsRE extracts the numeric components of a feet/inches
// measurement. Compiled once at package level instead of on every Apply
// call (the original compiled it per invocation).
var feetToCmDigitsRE = regexp.MustCompile("[0-9]+")

// postProcessFeetToCm converts a feet (and optional inches) measurement
// string into centimeters, returned as a rounded integer string.
type postProcessFeetToCm bool

// Apply parses up to two numbers out of value (feet, then inches) and
// returns the equivalent height in whole centimeters. Missing components
// are treated as zero, so an unparseable value yields "0".
func (p *postProcessFeetToCm) Apply(ctx context.Context, value string, q mappedQuery) string {
	const (
		footInCm = 30.48
		inchInCm = 2.54
	)

	filtered := feetToCmDigitsRE.FindAllString(value, -1)

	var feet, inches float64
	if len(filtered) > 0 {
		feet, _ = strconv.ParseFloat(filtered[0], 64)
	}
	if len(filtered) > 1 {
		inches, _ = strconv.ParseFloat(filtered[1], 64)
	}

	centimeters := feet*footInCm + inches*inchInCm

	// return rounded integer string
	return strconv.Itoa(int(math.Round(centimeters)))
}
// postProcessLbToKg converts a weight in pounds into whole kilograms;
// non-numeric values pass through unchanged.
type postProcessLbToKg bool

func (p *postProcessLbToKg) Apply(ctx context.Context, value string, q mappedQuery) string {
	const poundsToKg = 0.45359237

	pounds, err := strconv.ParseFloat(value, 64)
	if err != nil {
		// not a number - leave the value as-is
		return value
	}

	return strconv.Itoa(int(math.Round(pounds * poundsToKg)))
}
// postProcessJavascript runs a user-provided javascript snippet over the
// scraped value. The script sees the input as `value` and its return value
// becomes the new value. On any failure the original value is returned
// unchanged.
type postProcessJavascript string

func (p *postProcessJavascript) Apply(ctx context.Context, value string, q mappedQuery) string {
	vm := javascript.NewVM()
	if err := vm.Set("value", value); err != nil {
		logger.Warnf("javascript failed to set value: %v", err)
		return value
	}

	log := &javascript.Log{
		Logger:       logger.Logger,
		Prefix:       "",
		ProgressChan: make(chan float64),
	}

	// fixed: these previously used %w, which is only meaningful for
	// fmt.Errorf-style wrapping and renders as %!w(...) in log output
	if err := log.AddToVM("log", vm); err != nil {
		logger.Logger.Errorf("error adding log API: %v", err)
	}

	util := &javascript.Util{}
	if err := util.AddToVM("util", vm); err != nil {
		logger.Logger.Errorf("error adding util API: %v", err)
	}

	// wrap the snippet in an IIFE so `return` works inside it
	script, err := javascript.CompileScript("", "(function() { "+string(*p)+"})()")
	if err != nil {
		logger.Warnf("javascript failed to compile: %v", err)
		return value
	}

	output, err := vm.RunProgram(script)
	if err != nil {
		logger.Warnf("javascript failed to run: %v", err)
		return value
	}

	// assume output is string
	return output.String()
}
// mappedPostProcessAction is the yaml declaration of a single post-process
// step. Exactly one field may be set; ToPostProcessAction enforces this.
type mappedPostProcessAction struct {
	ParseDate    string                   `yaml:"parseDate"`
	SubtractDays bool                     `yaml:"subtractDays"`
	Replace      mappedRegexConfigs       `yaml:"replace"`
	SubScraper   *mappedScraperAttrConfig `yaml:"subScraper"`
	Map          map[string]string        `yaml:"map"`
	FeetToCm     bool                     `yaml:"feetToCm"`
	LbToKg       bool                     `yaml:"lbToKg"`
	Javascript   string                   `yaml:"javascript"`
}
// ToPostProcessAction converts the yaml declaration into its concrete
// postProcessAction implementation. Exactly one action field must be set;
// setting more than one (or none) is an error.
func (a mappedPostProcessAction) ToPostProcessAction() (postProcessAction, error) {
	var found string
	var ret postProcessAction

	// setAction records the chosen action, erroring if one was already set
	setAction := func(field string, action postProcessAction) error {
		if found != "" {
			return fmt.Errorf("post-process actions must have a single field, found %s and %s", found, field)
		}
		found = field
		ret = action
		return nil
	}

	if a.ParseDate != "" {
		action := postProcessParseDate(a.ParseDate)
		if err := setAction("parseDate", &action); err != nil {
			return nil, err
		}
	}
	if len(a.Replace) > 0 {
		action := postProcessReplace(a.Replace)
		if err := setAction("replace", &action); err != nil {
			return nil, err
		}
	}
	if a.SubScraper != nil {
		action := postProcessSubScraper(*a.SubScraper)
		if err := setAction("subScraper", &action); err != nil {
			return nil, err
		}
	}
	if a.Map != nil {
		action := postProcessMap(a.Map)
		if err := setAction("map", &action); err != nil {
			return nil, err
		}
	}
	if a.FeetToCm {
		action := postProcessFeetToCm(a.FeetToCm)
		if err := setAction("feetToCm", &action); err != nil {
			return nil, err
		}
	}
	if a.LbToKg {
		action := postProcessLbToKg(a.LbToKg)
		if err := setAction("lbToKg", &action); err != nil {
			return nil, err
		}
	}
	if a.SubtractDays {
		action := postProcessSubtractDays(a.SubtractDays)
		if err := setAction("subtractDays", &action); err != nil {
			return nil, err
		}
	}
	if a.Javascript != "" {
		action := postProcessJavascript(a.Javascript)
		if err := setAction("javascript", &action); err != nil {
			return nil, err
		}
	}

	if ret == nil {
		return nil, errors.New("invalid post-process action")
	}

	return ret, nil
}

View file

@ -0,0 +1,276 @@
package scraper
import (
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
// mappedResult is a single scraped object as a loose field -> value map.
type mappedResult map[string]interface{}

// mappedResults is an ordered list of scraped objects.
type mappedResults []mappedResult
// string returns the value at key as a string. The second return is false
// only when the key is absent; a present non-string value logs an error and
// yields ("", true).
func (r mappedResult) string(key string) (string, bool) {
	raw, present := r[key]
	if !present {
		return "", false
	}

	str, isString := raw.(string)
	if !isString {
		logger.Errorf("String field %s is %T in mappedResult", key, r[key])
	}
	return str, true
}
// mustString returns the string at key, logging an error and returning ""
// when the key is missing or holds a non-string value.
func (r mappedResult) mustString(key string) string {
	raw, present := r[key]
	if !present {
		logger.Errorf("Missing required string field %s in mappedResult", key)
		return ""
	}

	str, isString := raw.(string)
	if !isString {
		logger.Errorf("String field %s is %T in mappedResult", key, r[key])
	}
	return str
}
// stringPtr returns a pointer to the string at key, or nil when absent.
func (r mappedResult) stringPtr(key string) *string {
	if v, ok := r.string(key); ok {
		return &v
	}
	return nil
}
// stringSlice returns the value at key as a []string. A single string value
// is promoted to a one-element slice; other types log an error and yield nil.
func (r mappedResult) stringSlice(key string) []string {
	raw, present := r[key]
	if !present {
		return nil
	}

	// need to accept both []string and a bare string
	switch v := raw.(type) {
	case []string:
		return v
	case string:
		return []string{v}
	default:
		logger.Errorf("String slice field %s is %T in mappedResult", key, r[key])
		return nil
	}
}
// IntPtr returns a pointer to the int at key, or nil when the key is absent
// or the value is not an int (the latter also logs an error).
func (r mappedResult) IntPtr(key string) *int {
	raw, present := r[key]
	if !present {
		return nil
	}

	n, isInt := raw.(int)
	if !isInt {
		logger.Errorf("Int field %s is %T in mappedResult", key, r[key])
		return nil
	}
	return &n
}
// setSingleValue stores a single string value at r[index][key], appending a
// new result when index is exactly one past the end. Sparse indexes panic.
func (r mappedResults) setSingleValue(index int, key string, value string) mappedResults {
	ret := r
	if index >= len(ret) {
		ret = append(ret, make(mappedResult))
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)
	ret[index][key] = value
	return ret
}

// setMultiValue stores a multi-valued string slice at r[index][key],
// appending a new result when index is exactly one past the end.
func (r mappedResults) setMultiValue(index int, key string, value []string) mappedResults {
	ret := r
	if index >= len(ret) {
		ret = append(ret, make(mappedResult))
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)
	ret[index][key] = value
	return ret
}
// scrapedTags converts each result into a ScrapedTag; returns nil when there
// are no results.
func (r mappedResults) scrapedTags() []*models.ScrapedTag {
	if len(r) == 0 {
		return nil
	}

	ret := make([]*models.ScrapedTag, 0, len(r))
	for _, result := range r {
		ret = append(ret, result.scrapedTag())
	}
	return ret
}

// scrapedTag converts a single result into a ScrapedTag. Name is required.
func (r mappedResult) scrapedTag() *models.ScrapedTag {
	return &models.ScrapedTag{
		Name: r.mustString("Name"),
	}
}
// scrapedPerformer converts a single mapped result into a ScrapedPerformer.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedPerformer() *models.ScrapedPerformer {
	ret := &models.ScrapedPerformer{
		Name:           r.stringPtr("Name"),
		Disambiguation: r.stringPtr("Disambiguation"),
		Gender:         r.stringPtr("Gender"),
		URL:            r.stringPtr("URL"),
		URLs:           r.stringSlice("URLs"),
		Twitter:        r.stringPtr("Twitter"),
		Birthdate:      r.stringPtr("Birthdate"),
		Ethnicity:      r.stringPtr("Ethnicity"),
		Country:        r.stringPtr("Country"),
		EyeColor:       r.stringPtr("EyeColor"),
		Height:         r.stringPtr("Height"),
		Measurements:   r.stringPtr("Measurements"),
		FakeTits:       r.stringPtr("FakeTits"),
		PenisLength:    r.stringPtr("PenisLength"),
		Circumcised:    r.stringPtr("Circumcised"),
		CareerLength:   r.stringPtr("CareerLength"),
		Tattoos:        r.stringPtr("Tattoos"),
		Piercings:      r.stringPtr("Piercings"),
		Aliases:        r.stringPtr("Aliases"),
		Image:          r.stringPtr("Image"),
		Images:         r.stringSlice("Images"),
		Details:        r.stringPtr("Details"),
		DeathDate:      r.stringPtr("DeathDate"),
		HairColor:      r.stringPtr("HairColor"),
		Weight:         r.stringPtr("Weight"),
	}

	return ret
}
// scrapedPerformers converts each result into a ScrapedPerformer; returns
// nil when there are no results.
func (r mappedResults) scrapedPerformers() []*models.ScrapedPerformer {
	if len(r) == 0 {
		return nil
	}

	ret := make([]*models.ScrapedPerformer, 0, len(r))
	for _, result := range r {
		ret = append(ret, result.scrapedPerformer())
	}
	return ret
}
// scrapedScene converts a single mapped result into a ScrapedScene.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedScene() *models.ScrapedScene {
	ret := &models.ScrapedScene{
		Title:    r.stringPtr("Title"),
		Code:     r.stringPtr("Code"),
		Details:  r.stringPtr("Details"),
		Director: r.stringPtr("Director"),
		URL:      r.stringPtr("URL"),
		URLs:     r.stringSlice("URLs"),
		Date:     r.stringPtr("Date"),
		Image:    r.stringPtr("Image"),
		Duration: r.IntPtr("Duration"),
	}

	return ret
}
// scrapedImage converts a single mapped result into a ScrapedImage.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedImage() *models.ScrapedImage {
	ret := &models.ScrapedImage{
		Title:        r.stringPtr("Title"),
		Code:         r.stringPtr("Code"),
		Details:      r.stringPtr("Details"),
		Photographer: r.stringPtr("Photographer"),
		URLs:         r.stringSlice("URLs"),
		Date:         r.stringPtr("Date"),
	}

	return ret
}
// scrapedGallery converts a single mapped result into a ScrapedGallery.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedGallery() *models.ScrapedGallery {
	ret := &models.ScrapedGallery{
		Title:        r.stringPtr("Title"),
		Code:         r.stringPtr("Code"),
		Details:      r.stringPtr("Details"),
		Photographer: r.stringPtr("Photographer"),
		URL:          r.stringPtr("URL"),
		URLs:         r.stringSlice("URLs"),
		Date:         r.stringPtr("Date"),
	}

	return ret
}
// scrapedStudio converts a single mapped result into a ScrapedStudio.
// Name is required; all other fields are optional.
func (r mappedResult) scrapedStudio() *models.ScrapedStudio {
	ret := &models.ScrapedStudio{
		Name:    r.mustString("Name"),
		URL:     r.stringPtr("URL"),
		URLs:    r.stringSlice("URLs"),
		Image:   r.stringPtr("Image"),
		Details: r.stringPtr("Details"),
		Aliases: r.stringPtr("Aliases"),
	}

	return ret
}
// scrapedMovie converts a single mapped result into a ScrapedMovie.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedMovie() *models.ScrapedMovie {
	ret := &models.ScrapedMovie{
		Name:       r.stringPtr("Name"),
		Aliases:    r.stringPtr("Aliases"),
		URLs:       r.stringSlice("URLs"),
		Duration:   r.stringPtr("Duration"),
		Date:       r.stringPtr("Date"),
		Director:   r.stringPtr("Director"),
		Synopsis:   r.stringPtr("Synopsis"),
		FrontImage: r.stringPtr("FrontImage"),
		BackImage:  r.stringPtr("BackImage"),
	}

	return ret
}
// scrapedGroup converts a single mapped result into a ScrapedGroup.
// All fields are optional; absent keys yield nil pointers / nil slices.
func (r mappedResult) scrapedGroup() *models.ScrapedGroup {
	ret := &models.ScrapedGroup{
		Name:       r.stringPtr("Name"),
		Aliases:    r.stringPtr("Aliases"),
		URL:        r.stringPtr("URL"),
		URLs:       r.stringSlice("URLs"),
		Duration:   r.stringPtr("Duration"),
		Date:       r.stringPtr("Date"),
		Director:   r.stringPtr("Director"),
		Synopsis:   r.stringPtr("Synopsis"),
		FrontImage: r.stringPtr("FrontImage"),
		BackImage:  r.stringPtr("BackImage"),
	}

	return ret
}
// scrapedMovies converts each result into a ScrapedMovie; returns nil when
// there are no results.
func (r mappedResults) scrapedMovies() []*models.ScrapedMovie {
	if len(r) == 0 {
		return nil
	}

	ret := make([]*models.ScrapedMovie, 0, len(r))
	for _, result := range r {
		ret = append(ret, result.scrapedMovie())
	}
	return ret
}

// scrapedGroups converts each result into a ScrapedGroup; returns nil when
// there are no results.
func (r mappedResults) scrapedGroups() []*models.ScrapedGroup {
	if len(r) == 0 {
		return nil
	}

	ret := make([]*models.ScrapedGroup, 0, len(r))
	for _, result := range r {
		ret = append(ret, result.scrapedGroup())
	}
	return ret
}

View file

@ -0,0 +1,908 @@
package scraper
import (
"testing"
"github.com/stashapp/stash/pkg/models"
"github.com/stretchr/testify/assert"
)
// Test string method: present keys return (value, true); missing keys return
// ("", false); a non-string value logs an error but still reports ok=true
// with an empty string.
func TestMappedResultString(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResult
		key           string
		expectedValue string
		expectedOk    bool
	}{
		{
			name:          "valid string",
			data:          mappedResult{"name": "test"},
			key:           "name",
			expectedValue: "test",
			expectedOk:    true,
		},
		{
			name:          "missing key",
			data:          mappedResult{},
			key:           "missing",
			expectedValue: "",
			expectedOk:    false,
		},
		{
			name:          "wrong type still returns ok true but empty value",
			data:          mappedResult{"num": 123},
			key:           "num",
			expectedValue: "",
			expectedOk:    true, // logs error but returns ok=true
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			val, ok := test.data.string(test.key)
			assert.Equal(t, test.expectedValue, val)
			assert.Equal(t, test.expectedOk, ok)
		})
	}
}

// Test mustString method: missing keys and non-string values both degrade to
// "" (with a logged error) rather than failing.
func TestMappedResultMustString(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResult
		key           string
		expectedValue string
	}{
		{
			name:          "valid string",
			data:          mappedResult{"name": "test"},
			key:           "name",
			expectedValue: "test",
		},
		{
			name:          "missing key returns empty string",
			data:          mappedResult{},
			key:           "missing",
			expectedValue: "",
		},
		{
			name:          "wrong type returns empty string",
			data:          mappedResult{"num": 123},
			key:           "num",
			expectedValue: "",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			val := test.data.mustString(test.key)
			assert.Equal(t, test.expectedValue, val)
		})
	}
}

// Test stringPtr method: only a genuinely missing key yields nil; a present
// non-string value yields a pointer to "".
func TestMappedResultStringPtr(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResult
		key           string
		expectedValue *string
	}{
		{
			name:          "valid string",
			data:          mappedResult{"name": "test"},
			key:           "name",
			expectedValue: strPtr("test"),
		},
		{
			name:          "missing key returns nil",
			data:          mappedResult{},
			key:           "missing",
			expectedValue: nil,
		},
		{
			name:          "wrong type returns non-nil pointer to empty string",
			data:          mappedResult{"num": 123},
			key:           "num",
			expectedValue: strPtr(""), // string() returns empty string but ok=true
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			val := test.data.stringPtr(test.key)
			if test.expectedValue == nil {
				assert.Nil(t, val)
			} else {
				assert.NotNil(t, val)
				assert.Equal(t, *test.expectedValue, *val)
			}
		})
	}
}
// Test stringSlice method: []string values pass through, a bare string is
// promoted to a single-element slice, and any other type yields nil.
func TestMappedResultStringSlice(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResult
		key           string
		expectedValue []string
	}{
		{
			name:          "valid slice",
			data:          mappedResult{"tags": []string{"a", "b", "c"}},
			key:           "tags",
			expectedValue: []string{"a", "b", "c"},
		},
		{
			name:          "missing key returns nil",
			data:          mappedResult{},
			key:           "missing",
			expectedValue: nil,
		},
		{
			name:          "single value converted to slice",
			data:          mappedResult{"tags": "not a slice"},
			key:           "tags",
			expectedValue: []string{"not a slice"},
		},
		{
			name:          "wrong type returns nil",
			data:          mappedResult{"tags": 123},
			key:           "tags",
			expectedValue: nil,
		},
		{
			name:          "empty slice",
			data:          mappedResult{"tags": []string{}},
			key:           "tags",
			expectedValue: []string{},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			val := test.data.stringSlice(test.key)
			assert.Equal(t, test.expectedValue, val)
		})
	}
}

// Test IntPtr method: only a present int yields a non-nil pointer; a numeric
// string does not count.
func TestMappedResultIntPtr(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResult
		key           string
		expectedValue *int
	}{
		{
			name:          "valid int",
			data:          mappedResult{"duration": 120},
			key:           "duration",
			expectedValue: intPtr(120),
		},
		{
			name:          "missing key returns nil",
			data:          mappedResult{},
			key:           "missing",
			expectedValue: nil,
		},
		{
			name:          "wrong type returns nil",
			data:          mappedResult{"duration": "120"},
			key:           "duration",
			expectedValue: nil,
		},
		{
			name:          "zero value",
			data:          mappedResult{"duration": 0},
			key:           "duration",
			expectedValue: intPtr(0),
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			val := test.data.IntPtr(test.key)
			assert.Equal(t, test.expectedValue, val)
		})
	}
}
// Test setSingleValue method: indexes up to len(r) append a single new
// result; a sparse index (more than one past the end) panics, which is
// documented as expected behavior here.
func TestMappedResultsSetSingleValue(t *testing.T) {
	tests := []struct {
		name           string
		initialResults mappedResults
		index          int
		key            string
		value          string
		expectedLen    int
		shouldPanic    bool
	}{
		{
			name:           "append to empty",
			initialResults: mappedResults{},
			index:          0,
			key:            "name",
			value:          "test",
			expectedLen:    1,
			shouldPanic:    false,
		},
		{
			name:           "set in existing",
			initialResults: mappedResults{mappedResult{}},
			index:          0,
			key:            "name",
			value:          "test",
			expectedLen:    1,
			shouldPanic:    false,
		},
		{
			name:           "append to existing",
			initialResults: mappedResults{mappedResult{}},
			index:          1,
			key:            "name",
			value:          "test",
			expectedLen:    2,
			shouldPanic:    false,
		},
		{
			name:           "sparse index causes panic",
			initialResults: mappedResults{mappedResult{}},
			index:          5,
			key:            "name",
			value:          "test",
			expectedLen:    6,
			shouldPanic:    true,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			if test.shouldPanic {
				assert.Panics(t, func() {
					test.initialResults.setSingleValue(test.index, test.key, test.value)
				})
			} else {
				results := test.initialResults.setSingleValue(test.index, test.key, test.value)
				assert.Equal(t, test.expectedLen, len(results))
				assert.Equal(t, test.value, results[test.index][test.key])
			}
		})
	}
}

// Test setMultiValue method: same append-or-set semantics as setSingleValue,
// but storing a []string.
func TestMappedResultsSetMultiValue(t *testing.T) {
	tests := []struct {
		name           string
		initialResults mappedResults
		index          int
		key            string
		value          []string
		expectedLen    int
	}{
		{
			name:           "append to empty",
			initialResults: mappedResults{},
			index:          0,
			key:            "tags",
			value:          []string{"a", "b"},
			expectedLen:    1,
		},
		{
			name:           "set in existing",
			initialResults: mappedResults{mappedResult{}},
			index:          0,
			key:            "tags",
			value:          []string{"a", "b"},
			expectedLen:    1,
		},
		{
			name:           "append to existing",
			initialResults: mappedResults{mappedResult{}},
			index:          1,
			key:            "tags",
			value:          []string{"x", "y"},
			expectedLen:    2,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			results := test.initialResults.setMultiValue(test.index, test.key, test.value)
			assert.Equal(t, test.expectedLen, len(results))
			assert.Equal(t, test.value, results[test.index][test.key])
		})
	}
}
// Test scrapedTag method: a missing Name degrades to "" rather than failing.
func TestMappedResultScrapedTag(t *testing.T) {
	tests := []struct {
		name         string
		data         mappedResult
		expectedName string
	}{
		{
			name:         "valid tag",
			data:         mappedResult{"Name": "Action"},
			expectedName: "Action",
		},
		{
			name:         "missing name",
			data:         mappedResult{},
			expectedName: "",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			tag := test.data.scrapedTag()
			assert.NotNil(t, tag)
			assert.Equal(t, test.expectedName, tag.Name)
		})
	}
}

// Test scrapedTags method: empty results convert to nil, otherwise order is
// preserved.
func TestMappedResultsScrapedTags(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResults
		expectedCount int
		expectedNames []string
	}{
		{
			name:          "empty results",
			data:          mappedResults{},
			expectedCount: 0,
		},
		{
			name: "single tag",
			data: mappedResults{
				mappedResult{"Name": "Action"},
			},
			expectedCount: 1,
			expectedNames: []string{"Action"},
		},
		{
			name: "multiple tags",
			data: mappedResults{
				mappedResult{"Name": "Action"},
				mappedResult{"Name": "Drama"},
				mappedResult{"Name": "Comedy"},
			},
			expectedCount: 3,
			expectedNames: []string{"Action", "Drama", "Comedy"},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			tags := test.data.scrapedTags()
			if test.expectedCount == 0 {
				assert.Nil(t, tags)
			} else {
				assert.NotNil(t, tags)
				assert.Equal(t, test.expectedCount, len(tags))
				for i, expectedName := range test.expectedNames {
					assert.Equal(t, expectedName, tags[i].Name)
				}
			}
		})
	}
}
// Test scrapedPerformer method: fully-populated and empty results both
// convert without error; empty results yield nil field pointers.
func TestMappedResultScrapedPerformer(t *testing.T) {
	tests := []struct {
		name     string
		data     mappedResult
		validate func(t *testing.T, p *models.ScrapedPerformer)
	}{
		{
			name: "full performer",
			data: mappedResult{
				"Name":           "Jane Doe",
				"Disambiguation": "Actress",
				"Gender":         "Female",
				"URL":            "https://example.com/jane",
				"URLs":           []string{"url1", "url2"},
				"Twitter":        "@jane",
				"Birthdate":      "1990-01-01",
				"Ethnicity":      "Caucasian",
				"Country":        "USA",
				"EyeColor":       "Blue",
				"Height":         "5'6\"",
				"Measurements":   "36-24-36",
				"FakeTits":       "No",
				"PenisLength":    "N/A",
				"Circumcised":    "N/A",
				"CareerLength":   "10 years",
				"Tattoos":        "Yes",
				"Piercings":      "Yes",
				"Aliases":        "Jane Smith",
				"Image":          "image.jpg",
				"Images":         []string{"img1", "img2"},
				"Details":        "Some details",
				"DeathDate":      "N/A",
				"HairColor":      "Blonde",
				"Weight":         "130 lbs",
			},
			validate: func(t *testing.T, p *models.ScrapedPerformer) {
				assert.NotNil(t, p)
				assert.Equal(t, "Jane Doe", *p.Name)
				assert.Equal(t, "Actress", *p.Disambiguation)
				assert.Equal(t, "Female", *p.Gender)
				assert.Equal(t, "https://example.com/jane", *p.URL)
				assert.Equal(t, []string{"url1", "url2"}, p.URLs)
				assert.Equal(t, "@jane", *p.Twitter)
				assert.Equal(t, "Blonde", *p.HairColor)
				assert.Equal(t, "130 lbs", *p.Weight)
			},
		},
		{
			name: "minimal performer",
			data: mappedResult{},
			validate: func(t *testing.T, p *models.ScrapedPerformer) {
				assert.NotNil(t, p)
				assert.Nil(t, p.Name)
				assert.Nil(t, p.Gender)
				assert.Empty(t, p.URLs)
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			performer := test.data.scrapedPerformer()
			test.validate(t, performer)
		})
	}
}

// Test scrapedPerformers method: empty results convert to nil; otherwise the
// count is preserved.
func TestMappedResultsScrapedPerformers(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResults
		expectedCount int
	}{
		{
			name:          "empty results",
			data:          mappedResults{},
			expectedCount: 0,
		},
		{
			name: "single performer",
			data: mappedResults{
				mappedResult{"Name": "Jane Doe"},
			},
			expectedCount: 1,
		},
		{
			name: "multiple performers",
			data: mappedResults{
				mappedResult{"Name": "Jane Doe"},
				mappedResult{"Name": "John Doe"},
				mappedResult{"Name": "Alice"},
			},
			expectedCount: 3,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			performers := test.data.scrapedPerformers()
			if test.expectedCount == 0 {
				assert.Nil(t, performers)
			} else {
				assert.NotNil(t, performers)
				assert.Equal(t, test.expectedCount, len(performers))
			}
		})
	}
}
// Test scrapedScene method: verifies full field mapping (including the int
// Duration) and the all-nil minimal case.
func TestMappedResultScrapedScene(t *testing.T) {
	tests := []struct {
		name     string
		data     mappedResult
		validate func(t *testing.T, s *models.ScrapedScene)
	}{
		{
			name: "full scene",
			data: mappedResult{
				"Title":    "Scene Title",
				"Code":     "CODE123",
				"Details":  "Scene details",
				"Director": "John Smith",
				"URL":      "https://example.com/scene",
				"URLs":     []string{"url1", "url2"},
				"Date":     "2020-01-01",
				"Image":    "scene.jpg",
				"Duration": 3600,
			},
			validate: func(t *testing.T, s *models.ScrapedScene) {
				assert.NotNil(t, s)
				assert.Equal(t, "Scene Title", *s.Title)
				assert.Equal(t, "CODE123", *s.Code)
				assert.Equal(t, "Scene details", *s.Details)
				assert.Equal(t, "John Smith", *s.Director)
				assert.Equal(t, "https://example.com/scene", *s.URL)
				assert.Equal(t, []string{"url1", "url2"}, s.URLs)
				assert.Equal(t, "2020-01-01", *s.Date)
				assert.Equal(t, "scene.jpg", *s.Image)
				assert.Equal(t, 3600, *s.Duration)
			},
		},
		{
			name: "minimal scene",
			data: mappedResult{},
			validate: func(t *testing.T, s *models.ScrapedScene) {
				assert.NotNil(t, s)
				assert.Nil(t, s.Title)
				assert.Nil(t, s.Duration)
				assert.Empty(t, s.URLs)
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			scene := test.data.scrapedScene()
			test.validate(t, scene)
		})
	}
}

// Test scrapedImage method: verifies full field mapping and the all-nil
// minimal case.
func TestMappedResultScrapedImage(t *testing.T) {
	tests := []struct {
		name     string
		data     mappedResult
		validate func(t *testing.T, i *models.ScrapedImage)
	}{
		{
			name: "full image",
			data: mappedResult{
				"Title":        "Image Title",
				"Code":         "IMG123",
				"Details":      "Image details",
				"Photographer": "Jane Photographer",
				"URLs":         []string{"url1", "url2"},
				"Date":         "2020-06-15",
			},
			validate: func(t *testing.T, i *models.ScrapedImage) {
				assert.NotNil(t, i)
				assert.Equal(t, "Image Title", *i.Title)
				assert.Equal(t, "IMG123", *i.Code)
				assert.Equal(t, "Image details", *i.Details)
				assert.Equal(t, "Jane Photographer", *i.Photographer)
				assert.Equal(t, []string{"url1", "url2"}, i.URLs)
				assert.Equal(t, "2020-06-15", *i.Date)
			},
		},
		{
			name: "minimal image",
			data: mappedResult{},
			validate: func(t *testing.T, i *models.ScrapedImage) {
				assert.NotNil(t, i)
				assert.Nil(t, i.Title)
				assert.Empty(t, i.URLs)
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			image := test.data.scrapedImage()
			test.validate(t, image)
		})
	}
}
// TestMappedResultScrapedGallery exercises mappedResult.scrapedGallery with a
// fully populated result and with an empty one.
func TestMappedResultScrapedGallery(t *testing.T) {
	type testCase struct {
		name  string
		input mappedResult
		check func(t *testing.T, g *models.ScrapedGallery)
	}

	cases := []testCase{
		{
			name: "full gallery",
			input: mappedResult{
				"Title":        "Gallery Title",
				"Code":         "GAL123",
				"Details":      "Gallery details",
				"Photographer": "Jane Photographer",
				"URL":          "https://example.com/gallery",
				"URLs":         []string{"url1", "url2"},
				"Date":         "2020-07-20",
			},
			check: func(t *testing.T, g *models.ScrapedGallery) {
				assert.NotNil(t, g)
				assert.Equal(t, "Gallery Title", *g.Title)
				assert.Equal(t, "GAL123", *g.Code)
				assert.Equal(t, "Gallery details", *g.Details)
				assert.Equal(t, "Jane Photographer", *g.Photographer)
				assert.Equal(t, "https://example.com/gallery", *g.URL)
				assert.Equal(t, []string{"url1", "url2"}, g.URLs)
				assert.Equal(t, "2020-07-20", *g.Date)
			},
		},
		{
			name:  "minimal gallery",
			input: mappedResult{},
			check: func(t *testing.T, g *models.ScrapedGallery) {
				// a gallery is still returned; absent fields stay nil/empty
				assert.NotNil(t, g)
				assert.Nil(t, g.Title)
				assert.Empty(t, g.URLs)
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tc.check(t, tc.input.scrapedGallery())
		})
	}
}
// TestMappedResultScrapedStudio exercises mappedResult.scrapedStudio with a
// fully populated result and with an empty one.
func TestMappedResultScrapedStudio(t *testing.T) {
	type testCase struct {
		name  string
		input mappedResult
		check func(t *testing.T, st *models.ScrapedStudio)
	}

	cases := []testCase{
		{
			name: "full studio",
			input: mappedResult{
				"Name":    "Studio Name",
				"URL":     "https://example.com/studio",
				"URLs":    []string{"url1", "url2"},
				"Image":   "studio.jpg",
				"Details": "Studio details",
				"Aliases": "Studio Alias",
			},
			check: func(t *testing.T, st *models.ScrapedStudio) {
				assert.NotNil(t, st)
				assert.Equal(t, "Studio Name", st.Name)
				assert.Equal(t, "https://example.com/studio", *st.URL)
				assert.Equal(t, []string{"url1", "url2"}, st.URLs)
				assert.Equal(t, "studio.jpg", *st.Image)
				assert.Equal(t, "Studio details", *st.Details)
				assert.Equal(t, "Studio Alias", *st.Aliases)
			},
		},
		{
			name:  "minimal studio",
			input: mappedResult{},
			check: func(t *testing.T, st *models.ScrapedStudio) {
				assert.NotNil(t, st)
				// Name is not a pointer field; mustString falls back to ""
				assert.Equal(t, "", st.Name)
				assert.Nil(t, st.URL)
				assert.Empty(t, st.URLs)
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tc.check(t, tc.input.scrapedStudio())
		})
	}
}
// TestMappedResultScrapedMovie exercises mappedResult.scrapedMovie with a fully
// populated result and with an empty one.
func TestMappedResultScrapedMovie(t *testing.T) {
	type testCase struct {
		name  string
		input mappedResult
		check func(t *testing.T, m *models.ScrapedMovie)
	}

	cases := []testCase{
		{
			name: "full movie",
			input: mappedResult{
				"Name":       "Movie Title",
				"Aliases":    "Movie Alias",
				"URLs":       []string{"url1", "url2"},
				"Duration":   "120 minutes",
				"Date":       "2020-05-10",
				"Director":   "John Director",
				"Synopsis":   "Movie synopsis",
				"FrontImage": "front.jpg",
				"BackImage":  "back.jpg",
			},
			check: func(t *testing.T, m *models.ScrapedMovie) {
				assert.NotNil(t, m)
				assert.Equal(t, "Movie Title", *m.Name)
				assert.Equal(t, "Movie Alias", *m.Aliases)
				assert.Equal(t, []string{"url1", "url2"}, m.URLs)
				assert.Equal(t, "120 minutes", *m.Duration)
				assert.Equal(t, "2020-05-10", *m.Date)
				assert.Equal(t, "John Director", *m.Director)
				assert.Equal(t, "Movie synopsis", *m.Synopsis)
				assert.Equal(t, "front.jpg", *m.FrontImage)
				assert.Equal(t, "back.jpg", *m.BackImage)
			},
		},
		{
			name:  "minimal movie",
			input: mappedResult{},
			check: func(t *testing.T, m *models.ScrapedMovie) {
				// a movie is still returned; absent fields stay nil/empty
				assert.NotNil(t, m)
				assert.Nil(t, m.Name)
				assert.Empty(t, m.URLs)
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tc.check(t, tc.input.scrapedMovie())
		})
	}
}
// TestMappedResultsScrapedMovies verifies that mappedResults.scrapedMovies
// returns nil for an empty result set, and otherwise returns exactly one
// scraped movie per mapped result.
func TestMappedResultsScrapedMovies(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResults
		expectedCount int
	}{
		{
			name:          "empty results",
			data:          mappedResults{},
			expectedCount: 0,
		},
		{
			name: "single movie",
			data: mappedResults{
				mappedResult{"Name": "Movie 1"},
			},
			expectedCount: 1,
		},
		{
			name: "multiple movies",
			data: mappedResults{
				mappedResult{"Name": "Movie 1"},
				mappedResult{"Name": "Movie 2"},
				mappedResult{"Name": "Movie 3"},
			},
			expectedCount: 3,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			movies := test.data.scrapedMovies()
			if test.expectedCount == 0 {
				// an empty input is expected to yield a nil slice, not an empty one
				assert.Nil(t, movies)
			} else {
				// assert.Len reports both expected and actual lengths on failure,
				// and a non-zero length implies a non-nil slice
				assert.Len(t, movies, test.expectedCount)
			}
		})
	}
}
// TestMappedResultScrapedGroup exercises mappedResult.scrapedGroup with a fully
// populated result and with an empty one.
func TestMappedResultScrapedGroup(t *testing.T) {
	type testCase struct {
		name  string
		input mappedResult
		check func(t *testing.T, g *models.ScrapedGroup)
	}

	cases := []testCase{
		{
			name: "full group",
			input: mappedResult{
				"Name":       "Group Title",
				"Aliases":    "Group Alias",
				"URL":        "https://example.com/group",
				"URLs":       []string{"url1", "url2"},
				"Duration":   "240 minutes",
				"Date":       "2020-08-15",
				"Director":   "Jane Director",
				"Synopsis":   "Group synopsis",
				"FrontImage": "front.jpg",
				"BackImage":  "back.jpg",
			},
			check: func(t *testing.T, g *models.ScrapedGroup) {
				assert.NotNil(t, g)
				assert.Equal(t, "Group Title", *g.Name)
				assert.Equal(t, "Group Alias", *g.Aliases)
				assert.Equal(t, "https://example.com/group", *g.URL)
				assert.Equal(t, []string{"url1", "url2"}, g.URLs)
				assert.Equal(t, "240 minutes", *g.Duration)
				assert.Equal(t, "2020-08-15", *g.Date)
				assert.Equal(t, "Jane Director", *g.Director)
				assert.Equal(t, "Group synopsis", *g.Synopsis)
				assert.Equal(t, "front.jpg", *g.FrontImage)
				assert.Equal(t, "back.jpg", *g.BackImage)
			},
		},
		{
			name:  "minimal group",
			input: mappedResult{},
			check: func(t *testing.T, g *models.ScrapedGroup) {
				// a group is still returned; absent fields stay nil/empty
				assert.NotNil(t, g)
				assert.Nil(t, g.Name)
				assert.Empty(t, g.URLs)
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tc.check(t, tc.input.scrapedGroup())
		})
	}
}
// TestMappedResultsScrapedGroups verifies that mappedResults.scrapedGroups
// returns nil for an empty result set, and otherwise returns exactly one
// scraped group per mapped result.
func TestMappedResultsScrapedGroups(t *testing.T) {
	tests := []struct {
		name          string
		data          mappedResults
		expectedCount int
	}{
		{
			name:          "empty results",
			data:          mappedResults{},
			expectedCount: 0,
		},
		{
			name: "single group",
			data: mappedResults{
				mappedResult{"Name": "Group 1"},
			},
			expectedCount: 1,
		},
		{
			name: "multiple groups",
			data: mappedResults{
				mappedResult{"Name": "Group 1"},
				mappedResult{"Name": "Group 2"},
				mappedResult{"Name": "Group 3"},
			},
			expectedCount: 3,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			groups := test.data.scrapedGroups()
			if test.expectedCount == 0 {
				// an empty input is expected to yield a nil slice, not an empty one
				assert.Nil(t, groups)
			} else {
				// assert.Len reports both expected and actual lengths on failure,
				// and a non-zero length implies a non-nil slice
				assert.Len(t, groups, test.expectedCount)
			}
		})
	}
}
// strPtr returns a pointer to a copy of the given string. Test helper.
func strPtr(val string) *string {
	v := val
	return &v
}
// intPtr returns a pointer to a copy of the given int. Test helper.
func intPtr(val int) *int {
	v := val
	return &v
}

View file

@ -25,7 +25,7 @@ xPathScrapers:
- anything
`
c := &config{}
c := &Definition{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err == nil {

View file

@ -110,7 +110,7 @@ func (p queryURLParameters) constructURL(url string) string {
}
// replaceURL does a partial URL Replace ( only url parameter is used)
func replaceURL(url string, scraperConfig scraperTypeConfig) string {
func replaceURL(url string, scraperConfig ByURLDefinition) string {
u := url
queryURL := queryURLParameterFromURL(u)
if scraperConfig.QueryURLReplacements != nil {

View file

@ -208,22 +208,11 @@ func galleryInputFromGallery(gallery *models.Gallery) galleryInput {
var ErrScraperScript = errors.New("scraper script error")
type scriptScraper struct {
scraper scraperTypeConfig
config config
definition Definition
globalConfig GlobalConfig
}
func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
return &scriptScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
}
}
func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, out interface{}) error {
command := s.scraper.Script
func (s *scriptScraper) runScraperScript(ctx context.Context, command []string, inString string, out interface{}) error {
var cmd *exec.Cmd
if python.IsPythonCommand(command[0]) {
pythonPath := s.globalConfig.GetPythonPath()
@ -233,7 +222,7 @@ func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, o
logger.Warnf("%s", err)
} else {
cmd = p.Command(ctx, command[1:])
envVariable, _ := filepath.Abs(filepath.Dir(filepath.Dir(s.config.path)))
envVariable, _ := filepath.Abs(filepath.Dir(filepath.Dir(s.definition.path)))
python.AppendPythonPath(cmd, envVariable)
}
}
@ -243,7 +232,7 @@ func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, o
cmd = stashExec.CommandContext(ctx, command[0], command[1:]...)
}
cmd.Dir = filepath.Dir(s.config.path)
cmd.Dir = filepath.Dir(s.definition.path)
stdin, err := cmd.StdinPipe()
if err != nil {
@ -273,7 +262,7 @@ func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, o
return errors.New("error running scraper script")
}
go handleScraperStderr(s.config.Name, stderr)
go handleScraperStderr(s.definition.Name, stderr)
logger.Debugf("Scraper script <%s> started", strings.Join(cmd.Args, " "))
@ -312,7 +301,39 @@ func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, o
return nil
}
func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
func (s *scriptScraper) scrape(ctx context.Context, command []string, input string, ty ScrapeContentType) (ScrapedContent, error) {
switch ty {
case ScrapeContentTypePerformer:
var performer *models.ScrapedPerformer
err := s.runScraperScript(ctx, command, input, &performer)
return performer, err
case ScrapeContentTypeGallery:
var gallery *models.ScrapedGallery
err := s.runScraperScript(ctx, command, input, &gallery)
return gallery, err
case ScrapeContentTypeScene:
var scene *models.ScrapedScene
err := s.runScraperScript(ctx, command, input, &scene)
return scene, err
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
var movie *models.ScrapedMovie
err := s.runScraperScript(ctx, command, input, &movie)
return movie, err
case ScrapeContentTypeImage:
var image *models.ScrapedImage
err := s.runScraperScript(ctx, command, input, &image)
return image, err
}
return nil, ErrNotSupported
}
type scriptNameScraper struct {
scriptScraper
definition ByNameDefinition
}
func (s *scriptNameScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
input := `{"name": "` + name + `"}`
var ret []ScrapedContent
@ -320,7 +341,7 @@ func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty Scrape
switch ty {
case ScrapeContentTypePerformer:
var performers []models.ScrapedPerformer
err = s.runScraperScript(ctx, input, &performers)
err = s.runScraperScript(ctx, s.definition.Script, input, &performers)
if err == nil {
for _, p := range performers {
v := p
@ -329,7 +350,7 @@ func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty Scrape
}
case ScrapeContentTypeScene:
var scenes []models.ScrapedScene
err = s.runScraperScript(ctx, input, &scenes)
err = s.runScraperScript(ctx, s.definition.Script, input, &scenes)
if err == nil {
for _, s := range scenes {
v := s
@ -343,7 +364,21 @@ func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty Scrape
return ret, err
}
func (s *scriptScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
type scriptURLScraper struct {
scriptScraper
definition ByURLDefinition
}
func (s *scriptURLScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
return s.scrape(ctx, s.definition.Script, `{"url": "`+url+`"}`, ty)
}
type scriptFragmentScraper struct {
scriptScraper
definition ByFragmentDefinition
}
func (s *scriptFragmentScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
var inString []byte
var err error
var ty ScrapeContentType
@ -363,41 +398,10 @@ func (s *scriptScraper) scrapeByFragment(ctx context.Context, input Input) (Scra
return nil, err
}
return s.scrape(ctx, string(inString), ty)
return s.scrape(ctx, s.definition.Script, string(inString), ty)
}
func (s *scriptScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
return s.scrape(ctx, `{"url": "`+url+`"}`, ty)
}
func (s *scriptScraper) scrape(ctx context.Context, input string, ty ScrapeContentType) (ScrapedContent, error) {
switch ty {
case ScrapeContentTypePerformer:
var performer *models.ScrapedPerformer
err := s.runScraperScript(ctx, input, &performer)
return performer, err
case ScrapeContentTypeGallery:
var gallery *models.ScrapedGallery
err := s.runScraperScript(ctx, input, &gallery)
return gallery, err
case ScrapeContentTypeScene:
var scene *models.ScrapedScene
err := s.runScraperScript(ctx, input, &scene)
return scene, err
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
var movie *models.ScrapedMovie
err := s.runScraperScript(ctx, input, &movie)
return movie, err
case ScrapeContentTypeImage:
var image *models.ScrapedImage
err := s.runScraperScript(ctx, input, &image)
return image, err
}
return nil, ErrNotSupported
}
func (s *scriptScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
func (s *scriptFragmentScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
inString, err := json.Marshal(sceneInputFromScene(scene))
if err != nil {
@ -406,12 +410,12 @@ func (s *scriptScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sc
var ret *models.ScrapedScene
err = s.runScraperScript(ctx, string(inString), &ret)
err = s.runScraperScript(ctx, s.definition.Script, string(inString), &ret)
return ret, err
}
func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
func (s *scriptFragmentScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
inString, err := json.Marshal(galleryInputFromGallery(gallery))
if err != nil {
@ -420,12 +424,12 @@ func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mod
var ret *models.ScrapedGallery
err = s.runScraperScript(ctx, string(inString), &ret)
err = s.runScraperScript(ctx, s.definition.Script, string(inString), &ret)
return ret, err
}
func (s *scriptScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
func (s *scriptFragmentScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
inString, err := json.Marshal(imageToUpdateInput(image))
if err != nil {
@ -434,7 +438,7 @@ func (s *scriptScraper) scrapeImageByImage(ctx context.Context, image *models.Im
var ret *models.ScrapedImage
err = s.runScraperScript(ctx, string(inString), &ret)
err = s.runScraperScript(ctx, s.definition.Script, string(inString), &ret)
return ret, err
}

View file

@ -14,15 +14,13 @@ import (
)
type stashScraper struct {
scraper scraperTypeConfig
config config
config Definition
globalConfig GlobalConfig
client *http.Client
}
func newStashScraper(scraper scraperTypeConfig, client *http.Client, config config, globalConfig GlobalConfig) *stashScraper {
func newStashScraper(client *http.Client, config Definition, globalConfig GlobalConfig) *stashScraper {
return &stashScraper{
scraper: scraper,
config: config,
client: client,
globalConfig: globalConfig,

View file

@ -25,8 +25,8 @@ import (
const scrapeDefaultSleep = time.Second * 2
func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := scraperConfig.DriverOptions
func loadURL(ctx context.Context, loadURL string, client *http.Client, def Definition, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := def.DriverOptions
if driverOptions != nil && driverOptions.UseCDP {
// get the page using chrome dp
return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
@ -37,7 +37,7 @@ func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperCo
return nil, err
}
jar, err := scraperConfig.jar()
jar, err := def.jar()
if err != nil {
return nil, fmt.Errorf("error creating cookie jar: %w", err)
}
@ -83,7 +83,7 @@ func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperCo
}
bodyReader := bytes.NewReader(body)
printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
printCookies(jar, def, "Jar cookies found for scraper urls")
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
}

View file

@ -3,7 +3,6 @@ package scraper
import (
"bytes"
"context"
"errors"
"fmt"
"net/http"
"net/url"
@ -19,49 +18,36 @@ import (
)
type xpathScraper struct {
scraper scraperTypeConfig
config config
definition Definition
globalConfig GlobalConfig
client *http.Client
}
func newXpathScraper(scraper scraperTypeConfig, client *http.Client, config config, globalConfig GlobalConfig) *xpathScraper {
return &xpathScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
client: client,
func (s *xpathScraper) getXpathScraper(name string) (*mappedScraper, error) {
ret, ok := s.definition.XPathScrapers[name]
if !ok {
return nil, fmt.Errorf("xpath scraper with name %s not found in config", name)
}
return &ret, nil
}
func (s *xpathScraper) getXpathScraper() *mappedScraper {
return s.config.XPathScrapers[s.scraper.Scraper]
type xpathURLScraper struct {
xpathScraper
definition ByURLDefinition
}
func (s *xpathScraper) scrapeURL(ctx context.Context, url string) (*html.Node, *mappedScraper, error) {
scraper := s.getXpathScraper()
if scraper == nil {
return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
}
doc, err := s.loadURL(ctx, url)
if err != nil {
return nil, nil, err
}
return doc, scraper, nil
}
func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
u := replaceURL(url, s.scraper) // allow a URL Replace for performer by URL queries
doc, scraper, err := s.scrapeURL(ctx, u)
func (s *xpathURLScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc, u)
doc, err := s.loadURL(ctx, url)
if err != nil {
return nil, err
}
q := s.getXPathQuery(doc, url)
// if these just return the return values from scraper.scrape* functions then
// it ends up returning ScrapedContent(nil) rather than nil
switch ty {
@ -100,11 +86,15 @@ func (s *xpathScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeCon
return nil, ErrNotSupported
}
func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
scraper := s.getXpathScraper()
type xpathNameScraper struct {
xpathScraper
definition ByNameDefinition
}
if scraper == nil {
return nil, fmt.Errorf("%w: name %v", ErrNotFound, s.scraper.Scraper)
func (s *xpathNameScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
const placeholder = "{}"
@ -112,7 +102,7 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC
// replace the placeholder string with the URL-escaped name
escapedName := url.QueryEscape(name)
url := s.scraper.QueryURL
url := s.definition.QueryURL
url = strings.ReplaceAll(url, placeholder, escapedName)
doc, err := s.loadURL(ctx, url)
@ -151,18 +141,22 @@ func (s *xpathScraper) scrapeByName(ctx context.Context, name string, ty ScrapeC
return nil, ErrNotSupported
}
func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
type xpathFragmentScraper struct {
xpathScraper
definition ByFragmentDefinition
}
func (s *xpathFragmentScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
// construct the URL
queryURL := queryURLParametersFromScene(scene)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -175,7 +169,7 @@ func (s *xpathScraper) scrapeSceneByScene(ctx context.Context, scene *models.Sce
return scraper.scrapeScene(ctx, q)
}
func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
func (s *xpathFragmentScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
switch {
case input.Gallery != nil:
return nil, fmt.Errorf("%w: cannot use an xpath scraper as a gallery fragment scraper", ErrNotSupported)
@ -189,15 +183,14 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap
// construct the URL
queryURL := queryURLParametersFromScrapedScene(scene)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -210,18 +203,17 @@ func (s *xpathScraper) scrapeByFragment(ctx context.Context, input Input) (Scrap
return scraper.scrapeScene(ctx, q)
}
func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
func (s *xpathFragmentScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
// construct the URL
queryURL := queryURLParametersFromGallery(gallery)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -234,18 +226,17 @@ func (s *xpathScraper) scrapeGalleryByGallery(ctx context.Context, gallery *mode
return scraper.scrapeGallery(ctx, q)
}
func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
func (s *xpathFragmentScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
// construct the URL
queryURL := queryURLParametersFromImage(image)
if s.scraper.QueryURLReplacements != nil {
queryURL.applyReplacements(s.scraper.QueryURLReplacements)
if s.definition.QueryURLReplacements != nil {
queryURL.applyReplacements(s.definition.QueryURLReplacements)
}
url := queryURL.constructURL(s.scraper.QueryURL)
url := queryURL.constructURL(s.definition.QueryURL)
scraper := s.getXpathScraper()
if scraper == nil {
return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
scraper, err := s.getXpathScraper(s.definition.Scraper)
if err != nil {
return nil, err
}
doc, err := s.loadURL(ctx, url)
@ -259,14 +250,14 @@ func (s *xpathScraper) scrapeImageByImage(ctx context.Context, image *models.Ima
}
func (s *xpathScraper) loadURL(ctx context.Context, url string) (*html.Node, error) {
r, err := loadURL(ctx, url, s.client, s.config, s.globalConfig)
r, err := loadURL(ctx, url, s.client, s.definition, s.globalConfig)
if err != nil {
return nil, fmt.Errorf("failed to load URL %q: %w", url, err)
}
ret, err := html.Parse(r)
if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
if err == nil && s.definition.DebugOptions != nil && s.definition.DebugOptions.PrintHTML {
var b bytes.Buffer
if err := html.Render(&b, ret); err != nil {
logger.Warnf("could not render HTML: %v", err)

View file

@ -674,10 +674,10 @@ func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []strin
}
if expectedName != actualName {
t.Errorf("Expected performer name %s, got %s", expectedName, actualName)
t.Errorf("Expected performer name %q, got %q", expectedName, actualName)
}
if expectedURL != actualURL {
t.Errorf("Expected performer URL %s, got %s", expectedName, actualName)
t.Errorf("Expected performer URL %q, got %q", expectedURL, actualURL)
}
i++
}
@ -780,7 +780,7 @@ xPathScrapers:
Name: //studio
`
c := &config{}
c := &Definition{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err != nil {
@ -892,7 +892,7 @@ xPathScrapers:
selector: //span
`
c := &config{}
c := &Definition{}
err := yaml.Unmarshal([]byte(yamlStr), &c)
if err != nil {
@ -904,12 +904,8 @@ xPathScrapers:
client := &http.Client{}
ctx := context.Background()
s := newGroupScraper(*c, globalConfig)
us, ok := s.(urlScraper)
if !ok {
t.Error("couldn't convert scraper into url scraper")
}
content, err := us.viaURL(ctx, client, ts.URL, ScrapeContentTypePerformer)
s := scraperFromDefinition(*c, globalConfig)
content, err := s.viaURL(ctx, client, ts.URL, ScrapeContentTypePerformer)
if err != nil {
t.Errorf("Error scraping performer: %s", err.Error())