mirror of
https://github.com/stashapp/stash.git
synced 2025-12-06 16:34:02 +01:00
* Push scrapeByURL into scrapers Replace ScrapePerformerByURL, ScrapeMovie..., ... with ScrapeByURL in the scraperActionImpl interface. This allows us to delete a lot of repeated code in the scrapers and replace the central part with a switch on the scraper type. * Fold name scraping into one call Follow up on scraper refactoring. Name scrapers use the same code path. This allows us to restructure some code and kill some functions, adding variance to the name scraping code. It allows us to remove some code repetition as well. * Do not export loop refs. * Simplify fragment scraping Generalize fragment scrapers into ScrapeByFragment. This simplifies fragment code flows into a simpler pathing which should be easier to handle in the future. * Eliminate more context.TODO() In a number of cases, we have a context now. Use the context rather than TODO() for those cases in order to make those operations cancellable. * Pass the context for the stashbox scraper This removes all context.TODO() in the path of the stashbox scraper, and replaces it with the context that's present on each of the paths. * Pass the context into subscrapers Mostly a mechanical update, where we pass in the context for subscraping. This removes the final context.TODO() in the scraper code. * Warn on unknown fields from scripts A common mistake for new script writers is that they return fields not known to stash. For instance, the name "description" is used rather than "details". Decode disallowing unknown fields. If this fails, use a tee-reader to fall back to the old behavior, but print a warning for the user in this case. Thus, we retain the old behavior, but print warnings for scripts which fail the more strict unknown-fields detection. * Nil-check before running the postprocessing chain Fixes panics when scraping returns nil values. * Lift nil-ness in post-postprocessing If the struct we are trying to post-process is nil, we shouldn't enter the postprocessing flow at all.
Pass the struct as a value rather than a pointer, eliminating nil-checks as we go. Use the top-level postProcess call to make the nil-check and then abort there if the object we are looking at is nil. * Allow conversion routines to handle values If we have a non-pointer type in the interface, we should also convert those into ScrapedContent. Otherwise we get errors on deprecated functions.
234 lines
6.3 KiB
Go
234 lines
6.3 KiB
Go
package scraper
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/chromedp/cdproto/cdp"
|
|
"github.com/chromedp/cdproto/network"
|
|
"github.com/chromedp/chromedp"
|
|
jsoniter "github.com/json-iterator/go"
|
|
"golang.org/x/net/html/charset"
|
|
|
|
"github.com/stashapp/stash/pkg/logger"
|
|
)
|
|
|
|
// scrapeDefaultSleep is the fallback wait used by the CDP code paths in this
// file: after navigating to a page and after each configured click, when the
// scraper configuration does not specify a positive sleep of its own.
const scrapeDefaultSleep = time.Second * 2
|
|
|
|
func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
|
|
driverOptions := scraperConfig.DriverOptions
|
|
if driverOptions != nil && driverOptions.UseCDP {
|
|
// get the page using chrome dp
|
|
return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
jar, err := scraperConfig.jar()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating cookie jar: %w", err)
|
|
}
|
|
|
|
u, err := url.Parse(loadURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err)
|
|
}
|
|
|
|
// Fetch relevant cookies from the jar for url u and add them to the request
|
|
cookies := jar.Cookies(u)
|
|
for _, cookie := range cookies {
|
|
req.AddCookie(cookie)
|
|
}
|
|
|
|
userAgent := globalConfig.GetScraperUserAgent()
|
|
if userAgent != "" {
|
|
req.Header.Set("User-Agent", userAgent)
|
|
}
|
|
|
|
if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
|
|
for _, h := range driverOptions.Headers {
|
|
if h.Key != "" {
|
|
req.Header.Set(h.Key, h.Value)
|
|
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
|
|
}
|
|
}
|
|
}
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if resp.StatusCode >= 400 {
|
|
return nil, fmt.Errorf("http error %d:%s", resp.StatusCode, http.StatusText(resp.StatusCode))
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
bodyReader := bytes.NewReader(body)
|
|
printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
|
|
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
|
}
|
|
|
|
// func urlFromCDP uses chrome cdp and DOM to load and process the url
|
|
// if remote is set as true in the scraperConfig it will try to use localhost:9222
|
|
// else it will look for google-chrome in path
|
|
func urlFromCDP(ctx context.Context, url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
|
|
|
|
if !driverOptions.UseCDP {
|
|
return nil, fmt.Errorf("url shouldn't be fetched through CDP")
|
|
}
|
|
|
|
sleepDuration := scrapeDefaultSleep
|
|
|
|
if driverOptions.Sleep > 0 {
|
|
sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
|
|
}
|
|
|
|
// if scraperCDPPath is a remote address, then allocate accordingly
|
|
cdpPath := globalConfig.GetScraperCDPPath()
|
|
if cdpPath != "" {
|
|
var cancelAct context.CancelFunc
|
|
|
|
if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
|
|
remote := cdpPath
|
|
|
|
// if CDPPath is http(s) then we need to get the websocket URL
|
|
if isCDPPathHTTP(globalConfig) {
|
|
var err error
|
|
remote, err = getRemoteCDPWSAddress(ctx, remote)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
ctx, cancelAct = chromedp.NewRemoteAllocator(ctx, remote)
|
|
} else {
|
|
// use a temporary user directory for chrome
|
|
dir, err := os.MkdirTemp("", "stash-chromedp")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer os.RemoveAll(dir)
|
|
|
|
opts := append(chromedp.DefaultExecAllocatorOptions[:],
|
|
chromedp.UserDataDir(dir),
|
|
chromedp.ExecPath(cdpPath),
|
|
)
|
|
ctx, cancelAct = chromedp.NewExecAllocator(ctx, opts...)
|
|
}
|
|
|
|
defer cancelAct()
|
|
}
|
|
|
|
ctx, cancel := chromedp.NewContext(ctx)
|
|
defer cancel()
|
|
|
|
// add a fixed timeout for the http request
|
|
ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
|
|
defer cancel()
|
|
|
|
var res string
|
|
headers := cdpHeaders(driverOptions)
|
|
|
|
err := chromedp.Run(ctx,
|
|
network.Enable(),
|
|
setCDPCookies(driverOptions),
|
|
printCDPCookies(driverOptions, "Cookies found"),
|
|
network.SetExtraHTTPHeaders(network.Headers(headers)),
|
|
chromedp.Navigate(url),
|
|
chromedp.Sleep(sleepDuration),
|
|
setCDPClicks(driverOptions),
|
|
chromedp.OuterHTML("html", &res, chromedp.ByQuery),
|
|
printCDPCookies(driverOptions, "Cookies set"),
|
|
)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return strings.NewReader(res), nil
|
|
}
|
|
|
|
// click all xpaths listed in the scraper config
|
|
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
|
|
var tasks chromedp.Tasks
|
|
for _, click := range driverOptions.Clicks { // for each click element find the node from the xpath and add a click action
|
|
if click.XPath != "" {
|
|
xpath := click.XPath
|
|
waitDuration := scrapeDefaultSleep
|
|
if click.Sleep > 0 {
|
|
waitDuration = time.Duration(click.Sleep) * time.Second
|
|
}
|
|
|
|
action := chromedp.ActionFunc(func(ctx context.Context) error {
|
|
var nodes []*cdp.Node
|
|
if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
|
|
logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
|
|
return err
|
|
}
|
|
if len(nodes) == 0 {
|
|
logger.Debugf("Click xpath %s not found in page.\n", xpath)
|
|
return nil
|
|
}
|
|
logger.Debugf("Clicking %s\n", xpath)
|
|
return chromedp.MouseClickNode(nodes[0]).Do(ctx)
|
|
})
|
|
|
|
tasks = append(tasks, action)
|
|
tasks = append(tasks, chromedp.Sleep(waitDuration))
|
|
}
|
|
|
|
}
|
|
return tasks
|
|
}
|
|
|
|
// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
|
|
func getRemoteCDPWSAddress(ctx context.Context, url string) (string, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
resp, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
var result map[string]interface{}
|
|
var json = jsoniter.ConfigCompatibleWithStandardLibrary
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return "", err
|
|
}
|
|
remote := result["webSocketDebuggerUrl"].(string)
|
|
logger.Debugf("Remote cdp instance found %s", remote)
|
|
return remote, err
|
|
}
|
|
|
|
func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
|
|
headers := map[string]interface{}{}
|
|
if driverOptions.Headers != nil {
|
|
for _, h := range driverOptions.Headers {
|
|
if h.Key != "" {
|
|
headers[h.Key] = h.Value
|
|
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
|
|
}
|
|
}
|
|
}
|
|
return headers
|
|
}
|