stash/pkg/scraper/url.go
SmallCoccinelle e513b6ffa5
Cache and reuse the scraper HTTP client (#1855)
* Add Cookies directly to the request

Rather than maintaining a cookie jar on a one-shot HTTP client, maintain
the jar ourselves: make a new jar, then use it to select the right
cookies.

The cookies are set on the request rather than on the client. This
retains the current behavior, since we currently throw the client away
after each use.

This patch also paves the way for lifting the HTTP client out over time.
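
A minimal sketch of the idea, assuming the standard net/http types; the
helper name and signature are illustrative, not the actual stash code:

    import (
        "context"
        "net/http"
        "net/url"
    )

    // requestWithCookies is a hypothetical helper: it attaches the jar's
    // cookies to the request itself, so the one-shot client needs no Jar.
    func requestWithCookies(ctx context.Context, target string, jar http.CookieJar) (*http.Request, error) {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
        if err != nil {
            return nil, err
        }
        u, err := url.Parse(target)
        if err != nil {
            return nil, err
        }
        for _, c := range jar.Cookies(u) {
            req.AddCookie(c)
        }
        return req, nil
    }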

* Introduce a cached scraper HTTP client

The scraper cache is augmented with an *http.Client. These are safe for
concurrent use, so the pointer can safely be passed around. Push this
into scraper configurations where applicable, next to the txnManagers.

When we issue a loadURL request, do so on the cached *http.Client,
which will reuse existing idle connections if any are present.
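
Illustratively, the cache shape is something like the following; the
type and field names are assumptions, not the real stash types:

    import "net/http"

    // scraperCache is a hypothetical stand-in for the real cache type.
    // *http.Client is safe for concurrent use, so one long-lived pointer
    // can be shared by every scraper.
    type scraperCache struct {
        client   *http.Client
        scrapers map[string]config // per-scraper configurations
    }

    func newScraperCache(client *http.Client) *scraperCache {
        return &scraperCache{client: client, scrapers: make(map[string]config)}
    }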

* Set MaxIdleConnsPerHost. Closes #1850

We allow for up to 8 idle connections to a single host. This should
make concurrent operation toward the same host reuse connections, even
for sizeable concurrency.

The number isn't bumped excessively high. We should probably limit
concurrency toward a single site anyway, since we'll be able to overrun
a site with queries quite easily if we have many concurrent goroutines
issuing requests at the same time.
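
For example, a transport tuned along those lines could look like this;
only the value 8 comes from this patch, the helper itself is
illustrative:

    import "net/http"

    // newTransport is a hypothetical constructor: it clones the default
    // transport and raises the per-host idle connection limit so that
    // concurrent scrapes of one host can share connections.
    func newTransport() *http.Transport {
        t := http.DefaultTransport.(*http.Transport).Clone()
        t.MaxIdleConnsPerHost = 8
        return t
    }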

* Reinstate driverOptions / useCDP check

Use De Morgan's laws to invert the logic and exit early. Fixes breaking
tests.
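
The inversion itself, shown generically rather than as the exact hunk:

    // before: the CDP path is taken inside the positive condition
    if driverOptions != nil && driverOptions.UseCDP {
        // ... CDP path ...
    }

    // after De Morgan: negate both operands, swap && for ||, exit early
    if driverOptions == nil || !driverOptions.UseCDP {
        return nil, errNotCDP // hypothetical sentinel, for illustration
    }
    // ... CDP path continues un-nested ...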

* Documentation fixup.

* Use the scraper http.Client when fetching images

Fold image fetchers onto the cached scraper http.Client as well. This
gives the scraper a single cached http.Client for all of its
operations.

Thread the client upwards to the relevant attachment points: either the
cache, or a stash_box instance, which is extended to include a pointer
to the client.

Style roughly follows that of txnManagers.
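
A rough sketch of that style; the field and constructor names are
assumptions, not the actual stash_box API:

    import "net/http"

    // Client is a hypothetical stash_box client carrying the shared
    // *http.Client, threaded in the same way as the txnManager.
    type Client struct {
        httpClient *http.Client
    }

    func NewClient(httpClient *http.Client) *Client {
        return &Client{httpClient: httpClient}
    }

The same pointer can then be handed to the GraphQL client, as the next
item describes.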

* Use the same http.Client as the GraphQL client

Rather than using http.DefaultClient, use the same client the GraphQL
client uses in the stash_box subsystem. This localizes the client used
by the subsystem to the constructing New.. call.

* Hoist HTTP client construction

Create a function for initializing the HTTP client we use. While here,
hoist magic numbers into constants. Introduce a proper static redirect
error and use it in the client code as well.
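
A sketch of such a constructor; apart from the 8 idle connections per
host, the constant names and values here are assumptions:

    import (
        "errors"
        "net/http"
        "time"
    )

    const (
        maxIdleConnsPerHost = 8                // hoisted from the transport setup
        maxRedirects        = 20               // illustrative limit
        clientTimeout       = 60 * time.Second // illustrative timeout
    )

    // a static error lets callers match it with errors.Is
    var errMaxRedirects = errors.New("maximum number of redirects reached")

    func newHTTPClient() *http.Client {
        return &http.Client{
            Transport: &http.Transport{MaxIdleConnsPerHost: maxIdleConnsPerHost},
            Timeout:   clientTimeout,
            CheckRedirect: func(req *http.Request, via []*http.Request) error {
                if len(via) >= maxRedirects {
                    return errMaxRedirects
                }
                return nil
            },
        }
    }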

* Reinstate printCookies

This is a debugging function, and it might still come in handy at some
point.

* Nitpick comment.

* Minor tidy

Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
2021-10-20 16:12:24 +11:00

package scraper

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
	jsoniter "github.com/json-iterator/go"
	"golang.org/x/net/html/charset"

	"github.com/stashapp/stash/pkg/logger"
)
const scrapeDefaultSleep = time.Second * 2

func loadURL(ctx context.Context, loadURL string, client *http.Client, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
	driverOptions := scraperConfig.DriverOptions
	if driverOptions != nil && driverOptions.UseCDP {
		// get the page using chrome dp
		return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
	if err != nil {
		return nil, err
	}

	jar, err := scraperConfig.jar()
	if err != nil {
		return nil, fmt.Errorf("error creating cookie jar: %w", err)
	}

	u, err := url.Parse(loadURL)
	if err != nil {
		return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err)
	}

	// fetch relevant cookies from the jar for url u and add them to the request
	cookies := jar.Cookies(u)
	for _, cookie := range cookies {
		req.AddCookie(cookie)
	}

	userAgent := globalConfig.GetScraperUserAgent()
	if userAgent != "" {
		req.Header.Set("User-Agent", userAgent)
	}

	if driverOptions != nil { // setting the headers after the UA allows us to override it from inside the scraper
		for _, h := range driverOptions.Headers {
			if h.Key != "" {
				req.Header.Set(h.Key, h.Value)
				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
			}
		}
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// close the body even on the error path below, so the connection can
	// be reused by the client
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("http error %d: %s", resp.StatusCode, http.StatusText(resp.StatusCode))
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	bodyReader := bytes.NewReader(body)

	printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")

	return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
}
// urlFromCDP uses chrome via CDP and the DOM to load and process the url.
// If the configured CDP path is an http(s) or ws address it connects to
// that remote instance; otherwise it executes the chrome binary found at
// the configured path.
func urlFromCDP(ctx context.Context, url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
	if !driverOptions.UseCDP {
		return nil, fmt.Errorf("url shouldn't be fetched through CDP")
	}

	sleepDuration := scrapeDefaultSleep
	if driverOptions.Sleep > 0 {
		sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
	}

	act := context.TODO()

	// if scraperCDPPath is a remote address, then allocate accordingly
	cdpPath := globalConfig.GetScraperCDPPath()
	if cdpPath != "" {
		var cancelAct context.CancelFunc

		if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
			remote := cdpPath

			// if CDPPath is http(s) then we need to get the websocket URL
			if isCDPPathHTTP(globalConfig) {
				var err error
				remote, err = getRemoteCDPWSAddress(ctx, remote)
				if err != nil {
					return nil, err
				}
			}

			act, cancelAct = chromedp.NewRemoteAllocator(act, remote)
		} else {
			// use a temporary user directory for chrome
			dir, err := os.MkdirTemp("", "stash-chromedp")
			if err != nil {
				return nil, err
			}
			defer os.RemoveAll(dir)

			opts := append(chromedp.DefaultExecAllocatorOptions[:],
				chromedp.UserDataDir(dir),
				chromedp.ExecPath(cdpPath),
			)
			act, cancelAct = chromedp.NewExecAllocator(act, opts...)
		}

		defer cancelAct()
	}

	ctx, cancel := chromedp.NewContext(act)
	defer cancel()

	// add a fixed timeout for the http request
	ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
	defer cancel()

	var res string
	headers := cdpHeaders(driverOptions)

	err := chromedp.Run(ctx,
		network.Enable(),
		setCDPCookies(driverOptions),
		printCDPCookies(driverOptions, "Cookies found"),
		network.SetExtraHTTPHeaders(network.Headers(headers)),
		chromedp.Navigate(url),
		chromedp.Sleep(sleepDuration),
		setCDPClicks(driverOptions),
		chromedp.OuterHTML("html", &res, chromedp.ByQuery),
		printCDPCookies(driverOptions, "Cookies set"),
	)
	if err != nil {
		return nil, err
	}

	return strings.NewReader(res), nil
}
// setCDPClicks clicks all xpaths listed in the scraper config
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
	var tasks chromedp.Tasks

	// for each click element, find the node from the xpath and add a click action
	for _, click := range driverOptions.Clicks {
		if click.XPath != "" {
			xpath := click.XPath
			waitDuration := scrapeDefaultSleep
			if click.Sleep > 0 {
				waitDuration = time.Duration(click.Sleep) * time.Second
			}

			action := chromedp.ActionFunc(func(ctx context.Context) error {
				var nodes []*cdp.Node
				if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
					logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
					return err
				}
				if len(nodes) == 0 {
					logger.Debugf("Click xpath %s not found in page.\n", xpath)
					return nil
				}
				logger.Debugf("Clicking %s\n", xpath)
				return chromedp.MouseClickNode(nodes[0]).Do(ctx)
			})

			tasks = append(tasks, action)
			tasks = append(tasks, chromedp.Sleep(waitDuration))
		}
	}

	return tasks
}
// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
func getRemoteCDPWSAddress(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	var result map[string]interface{}
	var json = jsoniter.ConfigCompatibleWithStandardLibrary
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}

	// guard the type assertion so a malformed response returns an error
	// instead of panicking
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok {
		return "", fmt.Errorf("missing webSocketDebuggerUrl in response from %s", url)
	}
	logger.Debugf("Remote cdp instance found %s", remote)

	return remote, nil
}
func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
	headers := map[string]interface{}{}
	if driverOptions.Headers != nil {
		for _, h := range driverOptions.Headers {
			if h.Key != "" {
				headers[h.Key] = h.Value
				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
			}
		}
	}
	return headers
}