mirror of
https://github.com/stashapp/stash.git
synced 2025-12-08 17:35:40 +01:00
* Make the script scraper context-aware Connect the context to the command execution. This means command execution can be aborted if the context is canceled. The context is usually bound to user-interaction, i.e., a scraper operation issued by the user. Hence, it seems correct to abort a command if the user aborts. * Enable errchkjson Some json marshal calls are *safe* in that they can never fail. This is conditional on the types of the data being encoded. errchkjson finds those calls which are unsafe, and also not checked for errors. Add logging warnings to the place where unsafe encodings might happen. This can help uncover usage bugs early in stash if they are tripped, making debugging easier. While here, keep the checker enabled in the linter to capture future uses of json marshalling. * Pass the context for zip file scanning. * Pass the context in scanning * Pass context, replace context.TODO() Where applicable, pass the context down toward the lower functions in the call stack. Replace uses of context.TODO() with the passed context. This makes the code more context-aware, and you can rely on aborting contexts to clean up subsystems to a far greater extent now. I've left the cases where there is a context in a struct. My gut feeling is that they have solutions that are nice, but they require more deep thinking to unveil how to handle it. * Remove context from task-structs As a rule, contexts are better passed explicitly to functions than they are passed implicitly via structs. In the case of tasks, we already have a valid context in scope when creating the struct, so remove ctx from the struct and use the scoped context instead. With this change it is clear that the scanning functions are under a context, and the task-starting caller has jurisdiction over the context and its lifetime. A reader of the code doesn't have to figure out where the context is coming from anymore. 
While here, connect context.TODO() to the newly scoped context in most of the scan code. * Remove context from autotag struct too * Make more context-passing explicit In all of these cases, there is an applicable context which is close in the call-tree. Hook up to this context. * Simplify context passing in manager The managers context handling generally wants to use an outer context if applicable. However, the code doesn't pass it explicitly, but stores it in a struct. Pull out the context from the struct and use it to explicitly pass it. At a later point in time, we probably want to handle this by handing over the job to a different (program-lifetime) context for background jobs, but this will do for a start.
246 lines
6.2 KiB
Go
246 lines
6.2 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
stashExec "github.com/stashapp/stash/pkg/exec"
|
|
"github.com/stashapp/stash/pkg/logger"
|
|
"github.com/stashapp/stash/pkg/models"
|
|
"github.com/stashapp/stash/pkg/python"
|
|
)
|
|
|
|
// ErrScraperScript is the sentinel error returned when a scraper script
// fails; callers can detect script failures with errors.Is.
var ErrScraperScript = errors.New("scraper script error")
|
|
|
|
// scriptScraper executes an external script (optionally through a
// configured python interpreter) to scrape content.
type scriptScraper struct {
	scraper      scraperTypeConfig // the script command line to run
	config       config            // scraper config; its path determines the working directory
	globalConfig GlobalConfig      // global settings, e.g. the python path
}
|
|
|
|
func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
|
|
return &scriptScraper{
|
|
scraper: scraper,
|
|
config: config,
|
|
globalConfig: globalConfig,
|
|
}
|
|
}
|
|
|
|
func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, out interface{}) error {
|
|
command := s.scraper.Script
|
|
|
|
var cmd *exec.Cmd
|
|
if python.IsPythonCommand(command[0]) {
|
|
pythonPath := s.globalConfig.GetPythonPath()
|
|
var p *python.Python
|
|
if pythonPath != "" {
|
|
p = python.New(pythonPath)
|
|
} else {
|
|
p, _ = python.Resolve()
|
|
}
|
|
|
|
if p != nil {
|
|
cmd = p.Command(ctx, command[1:])
|
|
}
|
|
|
|
// if could not find python, just use the command args as-is
|
|
}
|
|
|
|
if cmd == nil {
|
|
cmd = stashExec.Command(command[0], command[1:]...)
|
|
}
|
|
|
|
cmd.Dir = filepath.Dir(s.config.path)
|
|
|
|
stdin, err := cmd.StdinPipe()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
go func() {
|
|
defer stdin.Close()
|
|
|
|
if n, err := io.WriteString(stdin, inString); err != nil {
|
|
logger.Warnf("failure to write full input to script (wrote %v bytes out of %v): %v", n, len(inString), err)
|
|
}
|
|
}()
|
|
|
|
stderr, err := cmd.StderrPipe()
|
|
if err != nil {
|
|
logger.Error("Scraper stderr not available: " + err.Error())
|
|
}
|
|
|
|
stdout, err := cmd.StdoutPipe()
|
|
if nil != err {
|
|
logger.Error("Scraper stdout not available: " + err.Error())
|
|
}
|
|
|
|
if err = cmd.Start(); err != nil {
|
|
logger.Error("Error running scraper script: " + err.Error())
|
|
return errors.New("error running scraper script")
|
|
}
|
|
|
|
go handleScraperStderr(s.config.Name, stderr)
|
|
|
|
logger.Debugf("Scraper script <%s> started", strings.Join(cmd.Args, " "))
|
|
|
|
// TODO - add a timeout here
|
|
// Make a copy of stdout here. This allows us to decode it twice.
|
|
var sb strings.Builder
|
|
tr := io.TeeReader(stdout, &sb)
|
|
|
|
// First, perform a decode where unknown fields are disallowed.
|
|
d := json.NewDecoder(tr)
|
|
d.DisallowUnknownFields()
|
|
strictErr := d.Decode(out)
|
|
|
|
if strictErr != nil {
|
|
// The decode failed for some reason, use the built string
|
|
// and allow unknown fields in the decode.
|
|
s := sb.String()
|
|
lenientErr := json.NewDecoder(strings.NewReader(s)).Decode(out)
|
|
if lenientErr != nil {
|
|
// The error is genuine, so return it
|
|
logger.Errorf("could not unmarshal json from script output: %v", lenientErr)
|
|
return fmt.Errorf("could not unmarshal json from script output: %w", lenientErr)
|
|
}
|
|
|
|
// Lenient decode succeeded, print a warning, but use the decode
|
|
logger.Warnf("reading script result: %v", strictErr)
|
|
}
|
|
|
|
err = cmd.Wait()
|
|
logger.Debugf("Scraper script finished")
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("%w: %v", ErrScraperScript, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty models.ScrapeContentType) ([]models.ScrapedContent, error) {
|
|
input := `{"name": "` + name + `"}`
|
|
|
|
var ret []models.ScrapedContent
|
|
var err error
|
|
switch ty {
|
|
case models.ScrapeContentTypePerformer:
|
|
var performers []models.ScrapedPerformer
|
|
err = s.runScraperScript(ctx, input, &performers)
|
|
if err == nil {
|
|
for _, p := range performers {
|
|
v := p
|
|
ret = append(ret, &v)
|
|
}
|
|
}
|
|
case models.ScrapeContentTypeScene:
|
|
var scenes []models.ScrapedScene
|
|
err = s.runScraperScript(ctx, input, &scenes)
|
|
if err == nil {
|
|
for _, s := range scenes {
|
|
v := s
|
|
ret = append(ret, &v)
|
|
}
|
|
}
|
|
default:
|
|
return nil, ErrNotSupported
|
|
}
|
|
|
|
return ret, err
|
|
}
|
|
|
|
func (s *scriptScraper) scrapeByFragment(ctx context.Context, input Input) (models.ScrapedContent, error) {
|
|
var inString []byte
|
|
var err error
|
|
var ty models.ScrapeContentType
|
|
switch {
|
|
case input.Performer != nil:
|
|
inString, err = json.Marshal(*input.Performer)
|
|
ty = models.ScrapeContentTypePerformer
|
|
case input.Gallery != nil:
|
|
inString, err = json.Marshal(*input.Gallery)
|
|
ty = models.ScrapeContentTypeGallery
|
|
case input.Scene != nil:
|
|
inString, err = json.Marshal(*input.Scene)
|
|
ty = models.ScrapeContentTypeScene
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s.scrape(ctx, string(inString), ty)
|
|
}
|
|
|
|
func (s *scriptScraper) scrapeByURL(ctx context.Context, url string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
|
|
return s.scrape(ctx, `{"url": "`+url+`"}`, ty)
|
|
}
|
|
|
|
func (s *scriptScraper) scrape(ctx context.Context, input string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
|
|
switch ty {
|
|
case models.ScrapeContentTypePerformer:
|
|
var performer *models.ScrapedPerformer
|
|
err := s.runScraperScript(ctx, input, &performer)
|
|
return performer, err
|
|
case models.ScrapeContentTypeGallery:
|
|
var gallery *models.ScrapedGallery
|
|
err := s.runScraperScript(ctx, input, &gallery)
|
|
return gallery, err
|
|
case models.ScrapeContentTypeScene:
|
|
var scene *models.ScrapedScene
|
|
err := s.runScraperScript(ctx, input, &scene)
|
|
return scene, err
|
|
case models.ScrapeContentTypeMovie:
|
|
var movie *models.ScrapedMovie
|
|
err := s.runScraperScript(ctx, input, &movie)
|
|
return movie, err
|
|
}
|
|
|
|
return nil, ErrNotSupported
|
|
}
|
|
|
|
func (s *scriptScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
|
|
inString, err := json.Marshal(sceneToUpdateInput(scene))
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var ret *models.ScrapedScene
|
|
|
|
err = s.runScraperScript(ctx, string(inString), &ret)
|
|
|
|
return ret, err
|
|
}
|
|
|
|
func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
|
|
inString, err := json.Marshal(galleryToUpdateInput(gallery))
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var ret *models.ScrapedGallery
|
|
|
|
err = s.runScraperScript(ctx, string(inString), &ret)
|
|
|
|
return ret, err
|
|
}
|
|
|
|
func handleScraperStderr(name string, scraperOutputReader io.ReadCloser) {
|
|
const scraperPrefix = "[Scrape / %s] "
|
|
|
|
lgr := logger.PluginLogger{
|
|
Logger: logger.Logger,
|
|
Prefix: fmt.Sprintf(scraperPrefix, name),
|
|
DefaultLogLevel: &logger.ErrorLevel,
|
|
}
|
|
lgr.ReadLogMessages(scraperOutputReader)
|
|
}
|