stash/pkg/scraper/url.go
SmallCoccinelle a5ca8fc678
Enable safe linters (#1786)
* Enable safe linters

Enable the linters dogsled, rowserrcheck, and sqlclosecheck.

These currently report no errors in the code base.
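
For illustration (not code from this patch), the pattern the latter two
linters enforce looks roughly like this, assuming the standard
database/sql package and a hypothetical query:

func scraperNames(db *sql.DB) ([]string, error) {
	rows, err := db.Query("SELECT name FROM scrapers") // hypothetical query
	if err != nil {
		return nil, err
	}
	defer rows.Close() // sqlclosecheck: the result set must be closed

	var names []string
	for rows.Next() {
		var name string
		if err := rows.Scan(&name); err != nil {
			return nil, err
		}
		names = append(names, name)
	}
	return names, rows.Err() // rowserrcheck: the iteration error must be checked
}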

Enable misspell.

Misspell finds two spelling mistakes in comments, which are fixed by the
patch as well.

Add and sort linters that are relatively safe to enable over time. They
are commented out for now.

* Close the response body

If we can get an HTTP response, it has a body that ought to be closed.

By doing so, we avoid potentially leaking connections.
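
A minimal sketch of the pattern, with the surrounding code elided (the
full version is in loadURL below):

resp, err := client.Do(req)
if err != nil {
	return nil, err // no response, so nothing to close
}
defer resp.Body.Close() // close the body even when the status signals an error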

* Enable the exportloopref linter

There are two places in the code with these warnings. Fix them while
enabling the linter.
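
A sketch of the class of bug exportloopref catches, with a hypothetical
item type: the range variable is reused on every iteration, so exported
pointers to it would all alias the same value.

type item struct{ id int }

func collect(items []item) []*item {
	var refs []*item
	for _, it := range items {
		it := it // copy; without this, every &it aliases the single loop variable
		refs = append(refs, &it)
	}
	return refs
}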

* Remove redundant types in tests

If a slice already determines the type, the inner type declaration is
redundant. Remove the inner declarations.
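
For example, with a hypothetical testCase struct:

type testCase struct{ name string }

func cases() []testCase {
	// before: []testCase{testCase{name: "a"}, testCase{name: "b"}}
	// after: the slice type already determines the element type
	return []testCase{{name: "a"}, {name: "b"}}
}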

* Mark autotag test cases as parallel

The autotag test cases are by far the outlier when it comes to test time.
While go test runs the tests of different packages in parallel, it doesn't
run the test cases within a given package in parallel unless one marks
them as parallel.

This change provides a significant speedup for test runs on an 8-core
machine.
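
A minimal sketch of the pattern, with hypothetical case names (assuming
the standard testing package):

func TestAutotag(t *testing.T) {
	cases := []string{"scene", "performer", "studio"} // hypothetical
	for _, tc := range cases {
		tc := tc // capture for the parallel closure (see exportloopref above)
		t.Run(tc, func(t *testing.T) {
			t.Parallel() // allow this subtest to run alongside the others
			// ... test body ...
		})
	}
}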
2021-10-03 11:48:03 +11:00


package scraper

import (
	"bytes"
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/http/cookiejar"
	"os"
	"strings"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
	jsoniter "github.com/json-iterator/go"
	"golang.org/x/net/html/charset"
	"golang.org/x/net/publicsuffix"

	"github.com/stashapp/stash/pkg/logger"
)
// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 60
const scrapeDefaultSleep = time.Second * 2
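
// loadURL loads the given url, either via CDP when the scraper's driver
// options ask for it, or via a plain http.Client with a cookie jar, and
// returns the response body as a UTF-8 reader.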
func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := scraperConfig.DriverOptions
if driverOptions != nil && driverOptions.UseCDP {
// get the page using chrome dp
return urlFromCDP(url, *driverOptions, globalConfig)
}
// get the page using http.Client
options := cookiejar.Options{
PublicSuffixList: publicsuffix.List,
}
	jar, err := cookiejar.New(&options)
	if err != nil {
		return nil, err
	}
setCookies(jar, scraperConfig)
printCookies(jar, scraperConfig, "Jar cookies set from scraper")
client := &http.Client{
		Transport: &http.Transport{ // skip certificate verification when the scraper cert check is disabled
TLSClientConfig: &tls.Config{InsecureSkipVerify: !globalConfig.GetScraperCertCheck()},
},
Timeout: scrapeGetTimeout,
// defaultCheckRedirect code with max changed from 10 to 20
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 20 {
return errors.New("stopped after 20 redirects")
}
return nil
},
Jar: jar,
}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
userAgent := globalConfig.GetScraperUserAgent()
if userAgent != "" {
req.Header.Set("User-Agent", userAgent)
}
if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
for _, h := range driverOptions.Headers {
if h.Key != "" {
req.Header.Set(h.Key, h.Value)
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
}
}
}
resp, err := client.Do(req)
if err != nil {
return nil, err
}
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("http error %d: %s", resp.StatusCode, http.StatusText(resp.StatusCode))
	}
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}
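	// the response body is closed when this function returns, so read it in
	// full first; charset.NewReader then decodes it to UTF-8 based on the
	// Content-Type header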
bodyReader := bytes.NewReader(body)
printCookies(jar, scraperConfig, "Jar cookies found for scraper urls")
return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
}

// urlFromCDP uses chrome cdp and DOM to load and process the url.
// If remote is set to true in the scraperConfig it will try to use localhost:9222,
// otherwise it will look for google-chrome in the path.
func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
if !driverOptions.UseCDP {
		return nil, errors.New("url shouldn't be fetched through CDP")
}
sleepDuration := scrapeDefaultSleep
if driverOptions.Sleep > 0 {
sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
}
act := context.Background()
// if scraperCDPPath is a remote address, then allocate accordingly
cdpPath := globalConfig.GetScraperCDPPath()
if cdpPath != "" {
var cancelAct context.CancelFunc
if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
remote := cdpPath
// if CDPPath is http(s) then we need to get the websocket URL
if isCDPPathHTTP(globalConfig) {
var err error
remote, err = getRemoteCDPWSAddress(remote)
if err != nil {
return nil, err
}
}
act, cancelAct = chromedp.NewRemoteAllocator(context.Background(), remote)
} else {
// use a temporary user directory for chrome
dir, err := os.MkdirTemp("", "stash-chromedp")
if err != nil {
return nil, err
}
defer os.RemoveAll(dir)
opts := append(chromedp.DefaultExecAllocatorOptions[:],
chromedp.UserDataDir(dir),
chromedp.ExecPath(cdpPath),
)
act, cancelAct = chromedp.NewExecAllocator(act, opts...)
}
defer cancelAct()
}
ctx, cancel := chromedp.NewContext(act)
defer cancel()
// add a fixed timeout for the http request
ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
defer cancel()
var res string
headers := cdpHeaders(driverOptions)
err := chromedp.Run(ctx,
network.Enable(),
setCDPCookies(driverOptions),
printCDPCookies(driverOptions, "Cookies found"),
network.SetExtraHTTPHeaders(network.Headers(headers)),
chromedp.Navigate(url),
chromedp.Sleep(sleepDuration),
setCDPClicks(driverOptions),
chromedp.OuterHTML("html", &res, chromedp.ByQuery),
printCDPCookies(driverOptions, "Cookies set"),
)
if err != nil {
return nil, err
}
return strings.NewReader(res), nil
}

// setCDPClicks clicks all xpaths listed in the scraper config
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
var tasks chromedp.Tasks
for _, click := range driverOptions.Clicks { // for each click element find the node from the xpath and add a click action
if click.XPath != "" {
xpath := click.XPath
waitDuration := scrapeDefaultSleep
if click.Sleep > 0 {
waitDuration = time.Duration(click.Sleep) * time.Second
}
action := chromedp.ActionFunc(func(ctx context.Context) error {
var nodes []*cdp.Node
if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
return err
}
if len(nodes) == 0 {
logger.Debugf("Click xpath %s not found in page.\n", xpath)
return nil
}
logger.Debugf("Clicking %s\n", xpath)
return chromedp.MouseClickNode(nodes[0]).Do(ctx)
})
tasks = append(tasks, action)
tasks = append(tasks, chromedp.Sleep(waitDuration))
}
}
return tasks
}

// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance
func getRemoteCDPWSAddress(address string) (string, error) {
resp, err := http.Get(address)
if err != nil {
return "", err
}
defer resp.Body.Close()
var result map[string]interface{}
var json = jsoniter.ConfigCompatibleWithStandardLibrary
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return "", err
}
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok {
		return "", errors.New("missing webSocketDebuggerUrl in the CDP response")
	}
	logger.Debugf("Remote cdp instance found %s", remote)
	return remote, nil
}
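
// cdpHeaders collects the scraper driver headers into the map format that
// network.SetExtraHTTPHeaders expects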
func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
headers := map[string]interface{}{}
if driverOptions.Headers != nil {
for _, h := range driverOptions.Headers {
if h.Key != "" {
headers[h.Key] = h.Value
logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
}
}
}
return headers
}