stash/pkg/scraper/script.go
SmallCoccinelle 401660e6a3
Hoist context, enable errchkjson (#2488)
* Make the script scraper context-aware

Connect the context to the command execution. This means command
execution can be aborted if the context is canceled. The context is
usually bound to user-interaction, i.e., a scraper operation issued
by the user. Hence, it seems correct to abort a command if the user
aborts.
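
A rough sketch of the idea (not the stash code itself, which also
resolves Python and wires up stdin/stdout): a command created via
exec.CommandContext is killed as soon as its context is canceled.

    package main

    import (
        "context"
        "fmt"
        "os/exec"
        "time"
    )

    // runScript is a hypothetical helper: exec.CommandContext kills the
    // spawned process once ctx is canceled or its deadline passes.
    func runScript(ctx context.Context, name string, args ...string) error {
        cmd := exec.CommandContext(ctx, name, args...)
        out, err := cmd.CombinedOutput()
        if err != nil {
            return fmt.Errorf("script aborted or failed: %w", err)
        }
        fmt.Printf("%s", out)
        return nil
    }

    func main() {
        // Simulate a user abort with a short timeout; "sleep" stands in
        // for an arbitrary scraper script.
        ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
        defer cancel()
        if err := runScript(ctx, "sleep", "10"); err != nil {
            fmt.Println(err)
        }
    }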

* Enable errchkjson

Some json marshal calls are *safe* in that they can never fail. This is
conditional on the types of the data being encoded. errchkjson finds
the calls which are unsafe and are not checked for errors.

Add logging warnings at the places where unsafe encodings might happen.
If these are ever tripped, they help uncover usage bugs in stash early,
making debugging easier.

While here, keep the checker enabled in the linter to capture future
uses of json marshalling.
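
A hedged sketch of the pattern errchkjson pushes toward; the payload
type here is invented. A type containing an interface, channel, or func
can make json.Marshal fail, so the error is checked and logged rather
than discarded.

    package main

    import (
        "encoding/json"
        "log"
    )

    // payload has an interface{} field, so marshalling it is not
    // statically safe: the concrete value decides whether it can fail.
    type payload struct {
        Name string      `json:"name"`
        Data interface{} `json:"data"`
    }

    func encode(p payload) []byte {
        b, err := json.Marshal(p)
        if err != nil {
            // Surfacing the failure makes usage bugs visible early.
            log.Printf("warning: could not marshal payload: %v", err)
            return nil
        }
        return b
    }

    func main() {
        // A channel is not JSON-encodable, so this hits the warning path.
        _ = encode(payload{Name: "x", Data: make(chan int)})
    }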

* Pass the context for zip file scanning.

* Pass the context in scanning

* Pass context, replace context.TODO()

Where applicable, pass the context down toward the lower functions in
the call stack. Replace uses of context.TODO() with the passed context.

This makes the code more context-aware, and you can now rely on an
aborted context to clean up subsystems to a far greater extent.
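
An illustrative before/after of the context.TODO() replacement; the
function and type names are made up, not actual stash call sites.

    package main

    import "context"

    // store and its find method are hypothetical lower-level callees.
    type store struct{}

    func (store) find(ctx context.Context, id int) error { return nil }

    // Before: the caller's context is dropped and a placeholder is used,
    // so cancellation never reaches the lower layers.
    func scanBefore(_ context.Context, s store, id int) error {
        return s.find(context.TODO(), id)
    }

    // After: the caller's context is threaded down the call stack.
    func scanAfter(ctx context.Context, s store, id int) error {
        return s.find(ctx, id)
    }

    func main() {
        _ = scanBefore(context.Background(), store{}, 1)
        _ = scanAfter(context.Background(), store{}, 1)
    }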

I've left the cases where a context is stored in a struct. My gut
feeling is that they have nice solutions, but they require deeper
thought to work out how to handle them.

* Remove context from task-structs

As a rule, contexts are better passed explicitly to functions than they
are passed implicitly via structs. In the case of tasks, we already
have a valid context in scope when creating the struct, so remove ctx
from the struct and use the scoped context instead.
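
A minimal sketch of the struct-to-parameter move; scanTask here is an
invented stand-in for the task structs touched by this change.

    package main

    import "context"

    // Before: the context is smuggled in via a field, hiding its lifetime.
    type scanTaskWithCtx struct {
        ctx  context.Context
        path string
    }

    func (t *scanTaskWithCtx) Start() error { return t.ctx.Err() }

    // After: the context is an explicit argument owned by the caller.
    type scanTask struct {
        path string
    }

    func (t *scanTask) Start(ctx context.Context) error { return ctx.Err() }

    func main() {
        ctx := context.Background()
        _ = (&scanTaskWithCtx{ctx: ctx, path: "/media"}).Start()
        _ = (&scanTask{path: "/media"}).Start(ctx)
    }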

With this change it is clear that the scanning functions run under a
context, and the task-starting caller has jurisdiction over that context
and its lifetime. A reader of the code no longer has to figure out where
the context is coming from.

While here, connect context.TODO() to the newly scoped context in most
of the scan code.

* Remove context from autotag struct too

* Make more context-passing explicit

In all of these cases, there is an applicable context which is close
in the call-tree. Hook up to this context.

* Simplify context passing in manager

The manager's context handling generally wants to use an outer context
where applicable. However, the code doesn't pass it explicitly; it
stores it in a struct. Pull the context out of the struct and pass it
explicitly instead.

At a later point in time, we probably want to handle this by handing
over the job to a different (program-lifetime) context for background
jobs, but this will do for a start.
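
A rough sketch of that direction, with invented names: background jobs
receive a program-lifetime context owned by the job system instead of
reusing a request-scoped context stored on the manager.

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // jobManager holds a program-lifetime context, canceled on shutdown.
    type jobManager struct {
        appCtx context.Context
    }

    // StartBackground runs a job under the application context rather
    // than under the request context that triggered it.
    func (m *jobManager) StartBackground(run func(context.Context)) {
        go run(m.appCtx)
    }

    func main() {
        appCtx, shutdown := context.WithCancel(context.Background())
        m := &jobManager{appCtx: appCtx}
        m.StartBackground(func(ctx context.Context) {
            <-ctx.Done()
            fmt.Println("background job stopped:", ctx.Err())
        })
        time.Sleep(10 * time.Millisecond)
        shutdown()
        time.Sleep(10 * time.Millisecond)
    }
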
2022-04-15 11:34:53 +10:00

package scraper

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os/exec"
	"path/filepath"
	"strings"

	stashExec "github.com/stashapp/stash/pkg/exec"
	"github.com/stashapp/stash/pkg/logger"
	"github.com/stashapp/stash/pkg/models"
	"github.com/stashapp/stash/pkg/python"
)

var ErrScraperScript = errors.New("scraper script error")

type scriptScraper struct {
	scraper      scraperTypeConfig
	config       config
	globalConfig GlobalConfig
}

func newScriptScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *scriptScraper {
	return &scriptScraper{
		scraper:      scraper,
		config:       config,
		globalConfig: globalConfig,
	}
}

func (s *scriptScraper) runScraperScript(ctx context.Context, inString string, out interface{}) error {
	command := s.scraper.Script

	var cmd *exec.Cmd
	if python.IsPythonCommand(command[0]) {
		pythonPath := s.globalConfig.GetPythonPath()
		var p *python.Python
		if pythonPath != "" {
			p = python.New(pythonPath)
		} else {
			p, _ = python.Resolve()
		}

		if p != nil {
			cmd = p.Command(ctx, command[1:])
		}

		// if could not find python, just use the command args as-is
	}

	if cmd == nil {
		cmd = stashExec.Command(command[0], command[1:]...)
	}

	cmd.Dir = filepath.Dir(s.config.path)

	stdin, err := cmd.StdinPipe()
	if err != nil {
		return err
	}

	go func() {
		defer stdin.Close()

		if n, err := io.WriteString(stdin, inString); err != nil {
			logger.Warnf("failure to write full input to script (wrote %v bytes out of %v): %v", n, len(inString), err)
		}
	}()

	stderr, err := cmd.StderrPipe()
	if err != nil {
		logger.Error("Scraper stderr not available: " + err.Error())
	}

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		logger.Error("Scraper stdout not available: " + err.Error())
	}

	if err = cmd.Start(); err != nil {
		logger.Error("Error running scraper script: " + err.Error())
		return errors.New("error running scraper script")
	}

	go handleScraperStderr(s.config.Name, stderr)

	logger.Debugf("Scraper script <%s> started", strings.Join(cmd.Args, " "))

	// TODO - add a timeout here

	// Make a copy of stdout here. This allows us to decode it twice.
	var sb strings.Builder
	tr := io.TeeReader(stdout, &sb)

	// First, perform a decode where unknown fields are disallowed.
	d := json.NewDecoder(tr)
	d.DisallowUnknownFields()
	strictErr := d.Decode(out)
	if strictErr != nil {
		// The decode failed for some reason, use the built string
		// and allow unknown fields in the decode.
		s := sb.String()
		lenientErr := json.NewDecoder(strings.NewReader(s)).Decode(out)
		if lenientErr != nil {
			// The error is genuine, so return it
			logger.Errorf("could not unmarshal json from script output: %v", lenientErr)
			return fmt.Errorf("could not unmarshal json from script output: %w", lenientErr)
		}

		// Lenient decode succeeded, print a warning, but use the decode
		logger.Warnf("reading script result: %v", strictErr)
	}

	err = cmd.Wait()
	logger.Debugf("Scraper script finished")

	if err != nil {
		return fmt.Errorf("%w: %v", ErrScraperScript, err)
	}

	return nil
}

func (s *scriptScraper) scrapeByName(ctx context.Context, name string, ty models.ScrapeContentType) ([]models.ScrapedContent, error) {
	input := `{"name": "` + name + `"}`

	var ret []models.ScrapedContent
	var err error

	switch ty {
	case models.ScrapeContentTypePerformer:
		var performers []models.ScrapedPerformer
		err = s.runScraperScript(ctx, input, &performers)
		if err == nil {
			for _, p := range performers {
				v := p
				ret = append(ret, &v)
			}
		}
	case models.ScrapeContentTypeScene:
		var scenes []models.ScrapedScene
		err = s.runScraperScript(ctx, input, &scenes)
		if err == nil {
			for _, s := range scenes {
				v := s
				ret = append(ret, &v)
			}
		}
	default:
		return nil, ErrNotSupported
	}

	return ret, err
}

func (s *scriptScraper) scrapeByFragment(ctx context.Context, input Input) (models.ScrapedContent, error) {
	var inString []byte
	var err error
	var ty models.ScrapeContentType

	switch {
	case input.Performer != nil:
		inString, err = json.Marshal(*input.Performer)
		ty = models.ScrapeContentTypePerformer
	case input.Gallery != nil:
		inString, err = json.Marshal(*input.Gallery)
		ty = models.ScrapeContentTypeGallery
	case input.Scene != nil:
		inString, err = json.Marshal(*input.Scene)
		ty = models.ScrapeContentTypeScene
	}

	if err != nil {
		return nil, err
	}

	return s.scrape(ctx, string(inString), ty)
}

func (s *scriptScraper) scrapeByURL(ctx context.Context, url string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
	return s.scrape(ctx, `{"url": "`+url+`"}`, ty)
}

func (s *scriptScraper) scrape(ctx context.Context, input string, ty models.ScrapeContentType) (models.ScrapedContent, error) {
	switch ty {
	case models.ScrapeContentTypePerformer:
		var performer *models.ScrapedPerformer
		err := s.runScraperScript(ctx, input, &performer)
		return performer, err
	case models.ScrapeContentTypeGallery:
		var gallery *models.ScrapedGallery
		err := s.runScraperScript(ctx, input, &gallery)
		return gallery, err
	case models.ScrapeContentTypeScene:
		var scene *models.ScrapedScene
		err := s.runScraperScript(ctx, input, &scene)
		return scene, err
	case models.ScrapeContentTypeMovie:
		var movie *models.ScrapedMovie
		err := s.runScraperScript(ctx, input, &movie)
		return movie, err
	}

	return nil, ErrNotSupported
}

func (s *scriptScraper) scrapeSceneByScene(ctx context.Context, scene *models.Scene) (*models.ScrapedScene, error) {
	inString, err := json.Marshal(sceneToUpdateInput(scene))
	if err != nil {
		return nil, err
	}

	var ret *models.ScrapedScene

	err = s.runScraperScript(ctx, string(inString), &ret)

	return ret, err
}

func (s *scriptScraper) scrapeGalleryByGallery(ctx context.Context, gallery *models.Gallery) (*models.ScrapedGallery, error) {
	inString, err := json.Marshal(galleryToUpdateInput(gallery))
	if err != nil {
		return nil, err
	}

	var ret *models.ScrapedGallery

	err = s.runScraperScript(ctx, string(inString), &ret)

	return ret, err
}

func handleScraperStderr(name string, scraperOutputReader io.ReadCloser) {
	const scraperPrefix = "[Scrape / %s] "

	lgr := logger.PluginLogger{
		Logger:          logger.Logger,
		Prefix:          fmt.Sprintf(scraperPrefix, name),
		DefaultLogLevel: &logger.ErrorLevel,
	}

	lgr.ReadLogMessages(scraperOutputReader)
}