package scraper

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/fetch"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
	jsoniter "github.com/json-iterator/go"
	"golang.org/x/net/html/charset"

	"github.com/stashapp/stash/pkg/logger"
)

const scrapeDefaultSleep = time.Second * 2

const (
	// maxRateLimitRetries is the maximum number of retries when receiving HTTP 429 responses.
	maxRateLimitRetries = 5

	// rateLimitBaseDelay is the initial backoff delay for 429 retries.
	rateLimitBaseDelay = time.Second * 2

	// rateLimitMaxDelay caps the exponential backoff to prevent excessively long waits.
	rateLimitMaxDelay = time.Minute

	// rateLimitTotalTimeout bounds the total wall-clock time for a single loadURL call,
	// including all retry delays, so that rate-limit retries don't run indefinitely.
	rateLimitTotalTimeout = 5 * time.Minute
)
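
// loadURL fetches loadURL with the given client, retrying on HTTP 429
// responses using rateLimitBackoff. With the defaults above and no
// Retry-After header, the successive waits are 2s+2s, 2s+4s, 2s+8s,
// 2s+16s and 2s+32s (Retry-After default plus exponential backoff),
// roughly 72s in total over the five retries, comfortably inside
// rateLimitTotalTimeout.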
func loadURL(ctx context.Context, loadURL string, client *http.Client, def Definition, globalConfig GlobalConfig) (io.Reader, error) {
	driverOptions := def.DriverOptions
	if driverOptions != nil && driverOptions.UseCDP {
		return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
	}

	jar, err := def.jar()
	if err != nil {
		return nil, fmt.Errorf("error creating cookie jar: %w", err)
	}

	u, err := url.Parse(loadURL)
	if err != nil {
		return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err)
	}

	userAgent := globalConfig.GetScraperUserAgent()

	// Apply an overall deadline so retry delays don't run indefinitely.
	ctx, cancel := context.WithTimeout(ctx, rateLimitTotalTimeout)
	defer cancel()

	for attempt := 0; ; attempt++ {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
		if err != nil {
			return nil, err
		}

		// Fetch relevant cookies from the jar for url u and add them to the request
		cookies := jar.Cookies(u)
		for _, cookie := range cookies {
			req.AddCookie(cookie)
		}

		if userAgent != "" {
			req.Header.Set("User-Agent", userAgent)
		}

		if driverOptions != nil {
			// setting the Headers after the UA allows us to override it from inside the scraper
			for _, h := range driverOptions.Headers {
				if h.Key != "" {
					req.Header.Set(h.Key, h.Value)
					logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
				}
			}
		}

		resp, err := client.Do(req)
		if err != nil {
			return nil, err
		}

		if resp.StatusCode == http.StatusTooManyRequests {
			resp.Body.Close()

			// attempt counts from 0: attempt 0 is the initial request,
			// attempts 1..maxRateLimitRetries are retries.
			if attempt >= maxRateLimitRetries {
				logger.Warnf("[scraper] rate limited on %s, all %d retries exhausted", loadURL, maxRateLimitRetries)
				return nil, &HTTPError{StatusCode: resp.StatusCode}
			}

			delay := rateLimitBackoff(resp, attempt)
			if delay < 0 {
				logger.Warnf("[scraper] rate limited on %s, server requested wait exceeds maximum", loadURL)
				return nil, &HTTPError{StatusCode: resp.StatusCode}
			}

			logger.Infof("[scraper] rate limited on %s (retry %d/%d), waiting %v", loadURL, attempt+1, maxRateLimitRetries, delay)

			select {
			case <-ctx.Done():
				return nil, ctx.Err()
			case <-time.After(delay):
				continue
			}
		}

		if resp.StatusCode >= 400 {
			resp.Body.Close()
			return nil, &HTTPError{StatusCode: resp.StatusCode}
		}

		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return nil, err
		}

		bodyReader := bytes.NewReader(body)

		printCookies(jar, def, "Jar cookies found for scraper urls")

		return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
	}
}

// rateLimitBackoff calculates the delay before retrying a rate-limited request.
// The delay is the sum of the parsed Retry-After value (defaulting to
// rateLimitBaseDelay when absent) and an exponential backoff (2s, 4s, 8s, ...,
// capped at rateLimitMaxDelay). Returns -1 if the server's Retry-After exceeds
// rateLimitMaxDelay, signalling that the caller should stop retrying.
func rateLimitBackoff(resp *http.Response, attempt int) time.Duration {
	retryAfter := parseRetryAfter(resp)

	// If the server asks us to wait longer than our max, give up immediately.
	if retryAfter > rateLimitMaxDelay {
		return -1
	}

	// Exponential backoff: 2s, 4s, 8s, 16s, 32s, ...
	// Guard against int64 overflow for large attempt values.
	if attempt >= 30 {
		return rateLimitMaxDelay
	}
	backoff := rateLimitBaseDelay << uint(attempt)

	return clampDelay(retryAfter + backoff)
}

// parseRetryAfter extracts a duration from the Retry-After header.
// Returns rateLimitBaseDelay if the header is absent or unparseable.
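// For example, "Retry-After: 30" yields 30s, while an HTTP-date such as
// "Retry-After: Wed, 21 Oct 2026 07:28:00 GMT" yields the time remaining
// until that instant; a date already in the past falls back to
// rateLimitBaseDelay.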
func parseRetryAfter(resp *http.Response) time.Duration {
	retryAfter := resp.Header.Get("Retry-After")
	if retryAfter == "" {
		return rateLimitBaseDelay
	}

	// Try parsing as seconds
	if seconds, err := strconv.Atoi(retryAfter); err == nil && seconds >= 0 {
		return time.Duration(seconds) * time.Second
	}

	// Try parsing as HTTP-date
	if t, err := http.ParseTime(retryAfter); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}

	return rateLimitBaseDelay
}

// clampDelay caps a duration to rateLimitMaxDelay.
func clampDelay(d time.Duration) time.Duration {
	if d > rateLimitMaxDelay {
		return rateLimitMaxDelay
	}
	return d
}

// urlFromCDP uses chrome cdp and DOM to load and process the url.
// If the scraper CDP path is a remote address (for example http://localhost:9222)
// it connects to that instance, otherwise the path is used as the chrome
// executable; when unset, chromedp looks for google-chrome in the path.
func urlFromCDP(ctx context.Context, urlCDP string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
	if !driverOptions.UseCDP {
		return nil, fmt.Errorf("url shouldn't be fetched through CDP")
	}

	sleepDuration := scrapeDefaultSleep
	if driverOptions.Sleep > 0 {
		sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
	}

	// if scraperCDPPath is a remote address, then allocate accordingly
	cdpPath := globalConfig.GetScraperCDPPath()
	if cdpPath != "" {
		var cancelAct context.CancelFunc

		if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
			remote := cdpPath

			// -------------------------------------------------------------------
			// #1023
			// when Chromium is listening for CDP connections it only accepts
			// requests with Host headers that are either IPs or `localhost`
			cdpURL, err := url.Parse(remote)
			if err != nil {
				return nil, fmt.Errorf("failed to parse CDP Path: %v", err)
			}
			hostname := cdpURL.Hostname()
			if hostname != "localhost" {
				if net.ParseIP(hostname) == nil { // not an IP
					addr, err := net.LookupIP(hostname)
					if err != nil || len(addr) == 0 { // can not resolve to IP
						return nil, fmt.Errorf("CDP: hostname <%s> can not be resolved", hostname)
					}
					if len(addr[0]) == 0 { // nil IP
						return nil, fmt.Errorf("CDP: hostname <%s> resolved to nil", hostname)
					}
					// addr is a valid IP; replace the host part of the cdpURL
					// with the IP and use that for remote
					cdpURL.Host = strings.Replace(cdpURL.Host, hostname, addr[0].String(), 1)
					remote = cdpURL.String()
				}
			}
			// --------------------------------------------------------------------
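
			// As an illustration: a CDP path such as http://chrome:9222
			// (a hypothetical container DNS name) is rewritten above to
			// whatever the name resolves to, e.g. http://172.17.0.2:9222,
			// so that Chromium accepts the request's Host header.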
			// if CDPPath is http(s) then we need to get the websocket URL
			if isCDPPathHTTP(globalConfig) {
				var err error
				remote, err = getRemoteCDPWSAddress(ctx, remote)
				if err != nil {
					return nil, err
				}
			}

			ctx, cancelAct = chromedp.NewRemoteAllocator(ctx, remote)
		} else {
			// use a temporary user directory for chrome
			dir, err := os.MkdirTemp("", "stash-chromedp")
			if err != nil {
				return nil, err
			}
			defer os.RemoveAll(dir)

			opts := append(chromedp.DefaultExecAllocatorOptions[:],
				chromedp.UserDataDir(dir),
				chromedp.ExecPath(cdpPath),
			)
			if globalConfig.GetProxy() != "" {
				// renamed from `url` to avoid shadowing the net/url package
				proxyURL, _, _ := splitProxyAuth(globalConfig.GetProxy())
				opts = append(opts, chromedp.ProxyServer(proxyURL))
			}
			ctx, cancelAct = chromedp.NewExecAllocator(ctx, opts...)
		}

		defer cancelAct()
	}

	ctx, cancel := chromedp.NewContext(ctx)
	defer cancel()

	// add a fixed timeout for the http request
	ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
	defer cancel()

	var res string

	headers := cdpHeaders(driverOptions)

	if proxyUsesAuth(globalConfig.GetProxy()) {
		_, user, pass := splitProxyAuth(globalConfig.GetProxy())

		// Based on https://github.com/chromedp/examples/blob/master/proxy/main.go
		lctx, lcancel := context.WithCancel(ctx)
		chromedp.ListenTarget(lctx, func(ev interface{}) {
			switch ev := ev.(type) {
			case *fetch.EventRequestPaused:
				go func() {
					_ = chromedp.Run(ctx, fetch.ContinueRequest(ev.RequestID))
				}()
			case *fetch.EventAuthRequired:
				if ev.AuthChallenge.Source == fetch.AuthChallengeSourceProxy {
					go func() {
						_ = chromedp.Run(ctx,
							fetch.ContinueWithAuth(ev.RequestID, &fetch.AuthChallengeResponse{
								Response: fetch.AuthChallengeResponseResponseProvideCredentials,
								Username: user,
								Password: pass,
							}),
							// Chrome will remember the credential for the current instance,
							// so we can disable the fetch domain once credential is provided.
							// Please file an issue if Chrome does not work in this way.
							fetch.Disable(),
						)
						// and cancel the event handler too.
						lcancel()
					}()
				}
			}
		})
	}

	err := chromedp.Run(ctx,
		network.Enable(),
		setCDPCookies(driverOptions),
		printCDPCookies(driverOptions, "Cookies found"),
		network.SetExtraHTTPHeaders(network.Headers(headers)),
		chromedp.Navigate(urlCDP),
		chromedp.Sleep(sleepDuration),
		setCDPClicks(driverOptions),
		chromedp.OuterHTML("html", &res, chromedp.ByQuery),
		printCDPCookies(driverOptions, "Cookies set"),
	)
	if err != nil {
		return nil, err
	}

	return strings.NewReader(res), nil
}

// setCDPClicks returns tasks that click all xpaths listed in the scraper config
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
	var tasks chromedp.Tasks
	for _, click := range driverOptions.Clicks {
		// for each click element find the node from the xpath and add a click action
		if click.XPath != "" {
			xpath := click.XPath
			waitDuration := scrapeDefaultSleep
			if click.Sleep > 0 {
				waitDuration = time.Duration(click.Sleep) * time.Second
			}

			action := chromedp.ActionFunc(func(ctx context.Context) error {
				var nodes []*cdp.Node
				if err := chromedp.Nodes(xpath, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
					logger.Debugf("Error %s looking for click xpath %s.\n", err, xpath)
					return err
				}
				if len(nodes) == 0 {
					logger.Debugf("Click xpath %s not found in page.\n", xpath)
					return nil
				}
				logger.Debugf("Clicking %s\n", xpath)
				return chromedp.MouseClickNode(nodes[0]).Do(ctx)
			})

			tasks = append(tasks, action)
			tasks = append(tasks, chromedp.Sleep(waitDuration))
		}
	}
	return tasks
}

// getRemoteCDPWSAddress returns the complete remote address that is required to access the cdp instance.
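// The address is expected to serve DevTools metadata as JSON, typically
// http://localhost:9222/json/version for a local instance, whose response
// contains a webSocketDebuggerUrl field such as
// "ws://localhost:9222/devtools/browser/<id>".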
func getRemoteCDPWSAddress(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	var result map[string]interface{}
	var json = jsoniter.ConfigCompatibleWithStandardLibrary
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}

	// guard the type assertion so a missing or non-string field returns an
	// error instead of panicking
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok {
		return "", fmt.Errorf("CDP: response from %s did not contain a webSocketDebuggerUrl string", url)
	}
	logger.Debugf("Remote cdp instance found %s", remote)
	return remote, nil
}

func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
	headers := map[string]interface{}{}
	if driverOptions.Headers != nil {
		for _, h := range driverOptions.Headers {
			if h.Key != "" {
				headers[h.Key] = h.Value
				logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
			}
		}
	}
	return headers
}

// proxyUsesAuth reports whether the given proxy URL carries embedded credentials.
func proxyUsesAuth(proxyUrl string) bool {
	if proxyUrl == "" {
		return false
	}
	reg := regexp.MustCompile(`^(https?:\/\/)(([\P{Cc}]+):([\P{Cc}]+)@)?(([a-zA-Z0-9][a-zA-Z0-9.-]*)(:[0-9]{1,5})?)`)
	matches := reg.FindAllStringSubmatch(proxyUrl, -1)
	if matches != nil {
		split := matches[0]
		// submatch 3 is the username; it is non-empty only when the
		// optional user:pass@ section matched
		return len(split) > 5 && split[3] != ""
	}
	return false
}

// splitProxyAuth splits a proxy URL into the URL without credentials,
// the username and the password.
func splitProxyAuth(proxyUrl string) (string, string, string) {
	if proxyUrl == "" {
		return "", "", ""
	}
	reg := regexp.MustCompile(`^(https?:\/\/)(([\P{Cc}]+):([\P{Cc}]+)@)?(([a-zA-Z0-9][a-zA-Z0-9.-]*)(:[0-9]{1,5})?)`)
	matches := reg.FindAllStringSubmatch(proxyUrl, -1)
	if matches != nil && len(matches[0]) > 5 {
		split := matches[0]
		// submatch 1 is the scheme, 5 the host (and optional port),
		// 3 and 4 the credentials
		return split[1] + split[5], split[3], split[4]
	}
	return proxyUrl, "", ""
}
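
// For illustration, splitProxyAuth("http://user:pass@127.0.0.1:8080") returns
// ("http://127.0.0.1:8080", "user", "pass"), and proxyUsesAuth reports true
// for the same input; a proxy URL without credentials yields empty username
// and password.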