mirror of
https://github.com/stashapp/stash.git
synced 2026-02-08 00:12:55 +01:00
* Remove reflection from mapped value processing * AI generated unit tests * Move mappedConfig to separate file * Rename group to configScraper * Separate mapped post-processing code into separate file * Update test after group rename * Check map entry when returning scraper * Refactor config into definition * Support single string for string slice translation * Rename config.go to definition.go * Rename configScraper to definedScraper * Rename config_scraper.go to defined_scraper.go
334 lines
7.5 KiB
Go
334 lines
7.5 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/stashapp/stash/pkg/logger"
|
|
"github.com/stashapp/stash/pkg/models"
|
|
"github.com/tidwall/gjson"
|
|
)
|
|
|
|
type jsonScraper struct {
|
|
definition Definition
|
|
globalConfig GlobalConfig
|
|
client *http.Client
|
|
}
|
|
|
|
func (s *jsonScraper) getJsonScraper(name string) (*mappedScraper, error) {
|
|
ret, ok := s.definition.JsonScrapers[name]
|
|
if !ok {
|
|
return nil, fmt.Errorf("json scraper with name %s not found in config", name)
|
|
}
|
|
|
|
return &ret, nil
|
|
}
|
|
|
|
func (s *jsonScraper) loadURL(ctx context.Context, url string) (string, error) {
|
|
r, err := loadURL(ctx, url, s.client, s.definition, s.globalConfig)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
logger.Infof("loadURL (%s)\n", url)
|
|
doc, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
docStr := string(doc)
|
|
if !gjson.Valid(docStr) {
|
|
return "", errors.New("not valid json")
|
|
}
|
|
|
|
if s.definition.DebugOptions != nil && s.definition.DebugOptions.PrintHTML {
|
|
logger.Infof("loadURL (%s) response: \n%s", url, docStr)
|
|
}
|
|
|
|
return docStr, err
|
|
}
|
|
|
|
type jsonURLScraper struct {
|
|
jsonScraper
|
|
definition ByURLDefinition
|
|
}
|
|
|
|
func (s *jsonURLScraper) scrapeByURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error) {
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
// if these just return the return values from scraper.scrape* functions then
|
|
// it ends up returning ScrapedContent(nil) rather than nil
|
|
switch ty {
|
|
case ScrapeContentTypePerformer:
|
|
ret, err := scraper.scrapePerformer(ctx, q)
|
|
if err != nil || ret == nil {
|
|
return nil, err
|
|
}
|
|
return ret, nil
|
|
case ScrapeContentTypeScene:
|
|
ret, err := scraper.scrapeScene(ctx, q)
|
|
if err != nil || ret == nil {
|
|
return nil, err
|
|
}
|
|
return ret, nil
|
|
case ScrapeContentTypeGallery:
|
|
ret, err := scraper.scrapeGallery(ctx, q)
|
|
if err != nil || ret == nil {
|
|
return nil, err
|
|
}
|
|
return ret, nil
|
|
case ScrapeContentTypeImage:
|
|
ret, err := scraper.scrapeImage(ctx, q)
|
|
if err != nil || ret == nil {
|
|
return nil, err
|
|
}
|
|
return ret, nil
|
|
case ScrapeContentTypeMovie, ScrapeContentTypeGroup:
|
|
ret, err := scraper.scrapeGroup(ctx, q)
|
|
if err != nil || ret == nil {
|
|
return nil, err
|
|
}
|
|
return ret, nil
|
|
}
|
|
|
|
return nil, ErrNotSupported
|
|
}
|
|
|
|
type jsonNameScraper struct {
|
|
jsonScraper
|
|
definition ByNameDefinition
|
|
}
|
|
|
|
func (s *jsonNameScraper) scrapeByName(ctx context.Context, name string, ty ScrapeContentType) ([]ScrapedContent, error) {
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
const placeholder = "{}"
|
|
|
|
// replace the placeholder string with the URL-escaped name
|
|
escapedName := url.QueryEscape(name)
|
|
|
|
url := s.definition.QueryURL
|
|
url = strings.ReplaceAll(url, placeholder, escapedName)
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
q.setType(SearchQuery)
|
|
|
|
var content []ScrapedContent
|
|
switch ty {
|
|
case ScrapeContentTypePerformer:
|
|
performers, err := scraper.scrapePerformers(ctx, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, p := range performers {
|
|
content = append(content, p)
|
|
}
|
|
|
|
return content, nil
|
|
case ScrapeContentTypeScene:
|
|
scenes, err := scraper.scrapeScenes(ctx, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, s := range scenes {
|
|
content = append(content, s)
|
|
}
|
|
|
|
return content, nil
|
|
}
|
|
|
|
return nil, ErrNotSupported
|
|
}
|
|
|
|
type jsonFragmentScraper struct {
|
|
jsonScraper
|
|
definition ByFragmentDefinition
|
|
}
|
|
|
|
func (s *jsonFragmentScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
|
|
// construct the URL
|
|
queryURL := queryURLParametersFromScene(scene)
|
|
if s.definition.QueryURLReplacements != nil {
|
|
queryURL.applyReplacements(s.definition.QueryURLReplacements)
|
|
}
|
|
url := queryURL.constructURL(s.definition.QueryURL)
|
|
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
return scraper.scrapeScene(ctx, q)
|
|
}
|
|
|
|
func (s *jsonFragmentScraper) scrapeByFragment(ctx context.Context, input Input) (ScrapedContent, error) {
|
|
switch {
|
|
case input.Gallery != nil:
|
|
return nil, fmt.Errorf("%w: cannot use a json scraper as a gallery fragment scraper", ErrNotSupported)
|
|
case input.Performer != nil:
|
|
return nil, fmt.Errorf("%w: cannot use a json scraper as a performer fragment scraper", ErrNotSupported)
|
|
case input.Scene == nil:
|
|
return nil, fmt.Errorf("%w: scene input is nil", ErrNotSupported)
|
|
}
|
|
|
|
scene := *input.Scene
|
|
|
|
// construct the URL
|
|
queryURL := queryURLParametersFromScrapedScene(scene)
|
|
if s.definition.QueryURLReplacements != nil {
|
|
queryURL.applyReplacements(s.definition.QueryURLReplacements)
|
|
}
|
|
url := queryURL.constructURL(s.definition.QueryURL)
|
|
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
return scraper.scrapeScene(ctx, q)
|
|
}
|
|
|
|
func (s *jsonFragmentScraper) scrapeImageByImage(ctx context.Context, image *models.Image) (*models.ScrapedImage, error) {
|
|
// construct the URL
|
|
queryURL := queryURLParametersFromImage(image)
|
|
if s.definition.QueryURLReplacements != nil {
|
|
queryURL.applyReplacements(s.definition.QueryURLReplacements)
|
|
}
|
|
url := queryURL.constructURL(s.definition.QueryURL)
|
|
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
return scraper.scrapeImage(ctx, q)
|
|
}
|
|
|
|
func (s *jsonFragmentScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
|
|
// construct the URL
|
|
queryURL := queryURLParametersFromGallery(gallery)
|
|
if s.definition.QueryURLReplacements != nil {
|
|
queryURL.applyReplacements(s.definition.QueryURLReplacements)
|
|
}
|
|
url := queryURL.constructURL(s.definition.QueryURL)
|
|
|
|
scraper, err := s.getJsonScraper(s.definition.Scraper)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc, err := s.loadURL(ctx, url)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
q := s.getJsonQuery(doc, url)
|
|
return scraper.scrapeGallery(ctx, q)
|
|
}
|
|
|
|
func (s *jsonScraper) getJsonQuery(doc string, url string) *jsonQuery {
|
|
return &jsonQuery{
|
|
doc: doc,
|
|
scraper: s,
|
|
url: url,
|
|
}
|
|
}
|
|
|
|
type jsonQuery struct {
|
|
doc string
|
|
scraper *jsonScraper
|
|
queryType QueryType
|
|
url string
|
|
}
|
|
|
|
func (q *jsonQuery) getType() QueryType {
|
|
return q.queryType
|
|
}
|
|
|
|
func (q *jsonQuery) setType(t QueryType) {
|
|
q.queryType = t
|
|
}
|
|
|
|
func (q *jsonQuery) getURL() string {
|
|
return q.url
|
|
}
|
|
|
|
func (q *jsonQuery) runQuery(selector string) ([]string, error) {
|
|
value := gjson.Get(q.doc, selector)
|
|
|
|
if !value.Exists() {
|
|
// many possible reasons why the selector may not be in the json object
|
|
// and not all are errors.
|
|
// Just return nil
|
|
return nil, nil
|
|
}
|
|
|
|
var ret []string
|
|
if value.IsArray() {
|
|
value.ForEach(func(k, v gjson.Result) bool {
|
|
ret = append(ret, v.String())
|
|
return true
|
|
})
|
|
} else {
|
|
ret = append(ret, value.String())
|
|
}
|
|
|
|
return ret, nil
|
|
}
|
|
|
|
func (q *jsonQuery) subScrape(ctx context.Context, value string) mappedQuery {
|
|
doc, err := q.scraper.loadURL(ctx, value)
|
|
|
|
if err != nil {
|
|
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
|
|
return nil
|
|
}
|
|
|
|
return q.scraper.getJsonQuery(doc, value)
|
|
}
|