stash/pkg/scraper/url.go
priv-r8s 7d44ea88eb Add exponential backoff for HTTP 429 rate limiting in scrapers
- Backoff delay = Retry-After + exponential (2s, 4s, 8s, ...)
- If Retry-After exceeds 60s max, give up immediately
- Respects Retry-After header as floor, adds incremental backoff
- Comprehensive unit tests for all backoff paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:16:24 -07:00

433 lines
12 KiB
Go

package scraper
import (
"bytes"
"context"
"fmt"
"io"
"net"
"net/http"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"time"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/fetch"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
jsoniter "github.com/json-iterator/go"
"golang.org/x/net/html/charset"
"github.com/stashapp/stash/pkg/logger"
)
// Timing and retry settings shared by the URL loaders in this file.
const (
	// scrapeDefaultSleep is the pause applied when a scraper does not
	// configure its own: after navigating to a page through CDP and after
	// each configured click.
	scrapeDefaultSleep = 2 * time.Second

	// maxRateLimitRetries is how many times a request answered with
	// HTTP 429 is retried before giving up.
	maxRateLimitRetries = 5

	// rateLimitBaseDelay is the first exponential backoff step for 429
	// retries, and the value assumed when Retry-After is absent or
	// unparseable.
	rateLimitBaseDelay = 2 * time.Second

	// rateLimitMaxDelay caps a single wait between retries; a Retry-After
	// larger than this makes the loader stop retrying.
	rateLimitMaxDelay = time.Minute

	// rateLimitTotalTimeout bounds the total wall-clock time of one loadURL
	// call, retry delays included, so retries cannot run indefinitely.
	rateLimitTotalTimeout = 5 * time.Minute
)
// loadURL fetches loadURL and returns a reader over the response body, passed
// through charset.NewReader together with the response Content-Type.
//
// When the scraper definition enables CDP the request is delegated to
// urlFromCDP. Otherwise a GET request is issued with client, carrying the
// cookies from the definition's jar, the globally configured user agent and
// any headers from the driver options (which may override the user agent).
//
// HTTP 429 responses are retried up to maxRateLimitRetries times, waiting for
// the delay computed by rateLimitBackoff between attempts. The whole call,
// waits included, is bounded by rateLimitTotalTimeout. A *HTTPError is
// returned when retries are exhausted, when rateLimitBackoff reports that the
// server asked for too long a wait, or for any other status >= 400.
func loadURL(ctx context.Context, loadURL string, client *http.Client, def Definition, globalConfig GlobalConfig) (io.Reader, error) {
	// maxDrainBytes bounds how much of a rejected response body is read
	// before it is closed.
	const maxDrainBytes = 64 << 10

	driverOptions := def.DriverOptions
	if driverOptions != nil && driverOptions.UseCDP {
		return urlFromCDP(ctx, loadURL, *driverOptions, globalConfig)
	}

	jar, err := def.jar()
	if err != nil {
		return nil, fmt.Errorf("error creating cookie jar: %w", err)
	}

	u, err := url.Parse(loadURL)
	if err != nil {
		return nil, fmt.Errorf("error parsing url %s: %w", loadURL, err)
	}

	userAgent := globalConfig.GetScraperUserAgent()

	// Apply an overall deadline so retry delays don't run indefinitely.
	ctx, cancel := context.WithTimeout(ctx, rateLimitTotalTimeout)
	defer cancel()

	for attempt := 0; ; attempt++ {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, loadURL, nil)
		if err != nil {
			return nil, err
		}

		// Fetch relevant cookies from the jar for url u and add them to the request
		cookies := jar.Cookies(u)
		for _, cookie := range cookies {
			req.AddCookie(cookie)
		}

		if userAgent != "" {
			req.Header.Set("User-Agent", userAgent)
		}

		if driverOptions != nil { // setting the Headers after the UA allows us to override it from inside the scraper
			for _, h := range driverOptions.Headers {
				if h.Key != "" {
					req.Header.Set(h.Key, h.Value)
					logger.Debugf("[scraper] adding header <%s:%s>", h.Key, h.Value)
				}
			}
		}

		resp, err := client.Do(req)
		if err != nil {
			return nil, err
		}

		if resp.StatusCode == http.StatusTooManyRequests {
			// Read (a bounded amount of) the body to EOF before closing so
			// the transport can reuse the connection for the retry. The
			// drain is best effort, so its error is deliberately ignored.
			_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrainBytes))
			resp.Body.Close()

			// attempt counts from 0: attempt 0 is the initial request,
			// attempts 1..maxRateLimitRetries are retries.
			if attempt >= maxRateLimitRetries {
				logger.Warnf("[scraper] rate limited on %s, all %d retries exhausted", loadURL, maxRateLimitRetries)
				return nil, &HTTPError{StatusCode: resp.StatusCode}
			}

			delay := rateLimitBackoff(resp, attempt)
			if delay < 0 {
				logger.Warnf("[scraper] rate limited on %s, server requested wait exceeds maximum", loadURL)
				return nil, &HTTPError{StatusCode: resp.StatusCode}
			}

			logger.Infof("[scraper] rate limited on %s (retry %d/%d), waiting %v", loadURL, attempt+1, maxRateLimitRetries, delay)

			// Use an explicit timer so it can be released right away when
			// the context ends before the delay has elapsed.
			timer := time.NewTimer(delay)
			select {
			case <-ctx.Done():
				timer.Stop()
				return nil, ctx.Err()
			case <-timer.C:
			}
			continue
		}

		if resp.StatusCode >= 400 {
			resp.Body.Close()
			return nil, &HTTPError{StatusCode: resp.StatusCode}
		}

		// Read the whole body up front so the response can be closed before
		// the deferred cancel of the request context runs.
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return nil, err
		}

		bodyReader := bytes.NewReader(body)
		printCookies(jar, def, "Jar cookies found for scraper urls")
		return charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
	}
}
// rateLimitBackoff returns how long to wait before retrying a request that
// was answered with HTTP 429.
//
// The wait is the value reported by parseRetryAfter for resp plus an
// exponential step of rateLimitBaseDelay doubled once per attempt
// (2s, 4s, 8s, ...), with the sum capped at rateLimitMaxDelay.
//
// It returns -1 when the server's Retry-After is larger than
// rateLimitMaxDelay, which tells the caller to stop retrying.
func rateLimitBackoff(resp *http.Response, attempt int) time.Duration {
	serverWait := parseRetryAfter(resp)

	switch {
	case serverWait > rateLimitMaxDelay:
		// The server asks for more patience than we are willing to have.
		return -1
	case attempt >= 30:
		// Shifting this far would risk overflowing int64; the result
		// would be capped anyway.
		return rateLimitMaxDelay
	}

	return clampDelay(serverWait + rateLimitBaseDelay<<uint(attempt))
}
// parseRetryAfter extracts the wait requested by the Retry-After header of
// resp.
//
// The header may hold a non-negative number of seconds or an HTTP-date. A
// number too large to be represented as a time.Duration saturates at the
// maximum duration instead of wrapping around, so callers comparing the
// result against an upper bound still see it as "too long".
//
// rateLimitBaseDelay is returned when the header is absent, unparseable,
// negative, or an HTTP-date that is not in the future.
func parseRetryAfter(resp *http.Response) time.Duration {
	const (
		maxDuration = time.Duration(1<<63 - 1)
		// maxSeconds is the largest second count that still fits in a
		// time.Duration once multiplied by time.Second.
		maxSeconds = int64(maxDuration / time.Second)
	)

	retryAfter := resp.Header.Get("Retry-After")
	if retryAfter == "" {
		return rateLimitBaseDelay
	}

	// Try parsing as seconds
	seconds, err := strconv.ParseInt(retryAfter, 10, 64)
	if err == nil {
		if seconds > maxSeconds {
			return maxDuration
		}
		if seconds >= 0 {
			return time.Duration(seconds) * time.Second
		}
	} else if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange && seconds > 0 {
		// A positive number beyond int64: ParseInt reports a range error
		// and returns the largest int64.
		return maxDuration
	}

	// Try parsing as HTTP-date
	if t, err := http.ParseTime(retryAfter); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}

	return rateLimitBaseDelay
}
// clampDelay limits d to at most rateLimitMaxDelay; smaller values, including
// negative ones, are returned unchanged.
func clampDelay(d time.Duration) time.Duration {
	if d <= rateLimitMaxDelay {
		return d
	}
	return rateLimitMaxDelay
}
// urlFromCDP loads urlCDP in a Chrome instance driven through chromedp and
// returns a reader over the outer HTML of the page's <html> element.
//
// How Chrome is reached depends on globalConfig.GetScraperCDPPath():
//   - an HTTP(S) or WebSocket address: a remote allocator is used; for HTTP(S)
//     the WebSocket address is first looked up with getRemoteCDPWSAddress
//   - any other non-empty value: it is used as the executable path of a local
//     exec allocator running with a temporary user data directory
//   - empty: no allocator is configured here before chromedp.NewContext
//
// After navigating, the function sleeps (driverOptions.Sleep seconds, or
// scrapeDefaultSleep), performs the configured clicks and then reads the DOM.
// The chromedp run is bounded by scrapeGetTimeout.
//
// An error is returned when driverOptions.UseCDP is false, when the CDP path
// cannot be parsed or resolved, when the temporary directory cannot be
// created, or when the chromedp run fails.
func urlFromCDP(ctx context.Context, urlCDP string, driverOptions scraperDriverOptions, globalConfig GlobalConfig) (io.Reader, error) {
	if !driverOptions.UseCDP {
		return nil, fmt.Errorf("url shouldn't be fetched through CDP")
	}

	// how long to wait after navigation before clicking and reading the DOM
	sleepDuration := scrapeDefaultSleep
	if driverOptions.Sleep > 0 {
		sleepDuration = time.Duration(driverOptions.Sleep) * time.Second
	}

	// if scraperCDPPath is a remote address, then allocate accordingly
	cdpPath := globalConfig.GetScraperCDPPath()
	if cdpPath != "" {
		var cancelAct context.CancelFunc

		if isCDPPathHTTP(globalConfig) || isCDPPathWS(globalConfig) {
			remote := cdpPath

			// -------------------------------------------------------------------
			// #1023
			// when chromium is listening over RDP it only accepts requests
			// with host headers that are either IPs or `localhost`
			cdpURL, err := url.Parse(remote)
			if err != nil {
				return nil, fmt.Errorf("failed to parse CDP Path: %v", err)
			}
			hostname := cdpURL.Hostname()
			if hostname != "localhost" {
				if net.ParseIP(hostname) == nil { // not an IP
					addr, err := net.LookupIP(hostname)
					if err != nil || len(addr) == 0 { // can not resolve to IP
						return nil, fmt.Errorf("CDP: hostname <%s> can not be resolved", hostname)
					}
					if len(addr[0]) == 0 { // nil IP
						return nil, fmt.Errorf("CDP: hostname <%s> resolved to nil", hostname)
					}
					// addr is a valid IP
					// replace the host part of the cdpURL with the IP
					// NOTE(review): only the first resolved address is used and it is
					// substituted as plain text; if it is an IPv6 address the host
					// would presumably need brackets - confirm.
					cdpURL.Host = strings.Replace(cdpURL.Host, hostname, addr[0].String(), 1)
					// use that for remote
					remote = cdpURL.String()
				}
			}
			// --------------------------------------------------------------------

			// if CDPPath is http(s) then we need to get the websocket URL
			if isCDPPathHTTP(globalConfig) {
				var err error
				remote, err = getRemoteCDPWSAddress(ctx, remote)
				if err != nil {
					return nil, err
				}
			}

			ctx, cancelAct = chromedp.NewRemoteAllocator(ctx, remote)
		} else {
			// use a temporary user directory for chrome
			dir, err := os.MkdirTemp("", "stash-chromedp")
			if err != nil {
				return nil, err
			}
			// deferred to function return, so the directory outlives the chromedp run below
			defer os.RemoveAll(dir)

			opts := append(chromedp.DefaultExecAllocatorOptions[:],
				chromedp.UserDataDir(dir),
				chromedp.ExecPath(cdpPath),
			)
			if globalConfig.GetProxy() != "" {
				// only the credential-free part of the proxy URL is handed to Chrome;
				// the credentials are supplied by the auth handler further down
				url, _, _ := splitProxyAuth(globalConfig.GetProxy())
				opts = append(opts, chromedp.ProxyServer(url))
			}
			ctx, cancelAct = chromedp.NewExecAllocator(ctx, opts...)
		}

		defer cancelAct()
	}

	ctx, cancel := chromedp.NewContext(ctx)
	defer cancel()

	// add a fixed timeout for the http request
	ctx, cancel = context.WithTimeout(ctx, scrapeGetTimeout)
	defer cancel()

	var res string
	headers := cdpHeaders(driverOptions)

	if proxyUsesAuth(globalConfig.GetProxy()) {
		_, user, pass := splitProxyAuth(globalConfig.GetProxy())

		// Based on https://github.com/chromedp/examples/blob/master/proxy/main.go
		// lctx scopes the listener: it is cancelled once credentials have been
		// provided, and otherwise ends together with its parent ctx.
		// NOTE(review): no fetch.Enable call is visible in this function - confirm
		// that these fetch events are actually delivered.
		lctx, lcancel := context.WithCancel(ctx)
		chromedp.ListenTarget(lctx, func(ev interface{}) {
			switch ev := ev.(type) {
			case *fetch.EventRequestPaused:
				// let paused requests continue; run in a goroutine so the
				// listener callback itself returns immediately
				go func() {
					_ = chromedp.Run(ctx, fetch.ContinueRequest(ev.RequestID))
				}()
			case *fetch.EventAuthRequired:
				// only answer challenges issued by the proxy
				if ev.AuthChallenge.Source == fetch.AuthChallengeSourceProxy {
					go func() {
						_ = chromedp.Run(ctx,
							fetch.ContinueWithAuth(ev.RequestID, &fetch.AuthChallengeResponse{
								Response: fetch.AuthChallengeResponseResponseProvideCredentials,
								Username: user,
								Password: pass,
							}),
							// Chrome will remember the credential for the current instance,
							// so we can disable the fetch domain once credential is provided.
							// Please file an issue if Chrome does not work in this way.
							fetch.Disable(),
						)
						// and cancel the event handler too.
						lcancel()
					}()
				}
			}
		})
	}

	// the actions run in order: set cookies and headers, navigate, wait,
	// perform the configured clicks, then capture the resulting document
	err := chromedp.Run(ctx,
		network.Enable(),
		setCDPCookies(driverOptions),
		printCDPCookies(driverOptions, "Cookies found"),
		network.SetExtraHTTPHeaders(network.Headers(headers)),
		chromedp.Navigate(urlCDP),
		chromedp.Sleep(sleepDuration),
		setCDPClicks(driverOptions),
		chromedp.OuterHTML("html", &res, chromedp.ByQuery),
		printCDPCookies(driverOptions, "Cookies set"),
	)

	if err != nil {
		return nil, err
	}

	return strings.NewReader(res), nil
}
// setCDPClicks builds the chromedp tasks that click every XPath listed in the
// scraper config. Each click is followed by a sleep of click.Sleep seconds, or
// scrapeDefaultSleep when none is configured. Entries with an empty XPath are
// skipped, and an XPath that matches nothing in the page is not an error.
func setCDPClicks(driverOptions scraperDriverOptions) chromedp.Tasks {
	var clickTasks chromedp.Tasks

	for _, click := range driverOptions.Clicks {
		if click.XPath == "" {
			continue
		}

		// per-iteration copy for the closure below
		target := click.XPath

		pause := scrapeDefaultSleep
		if click.Sleep > 0 {
			pause = time.Duration(click.Sleep) * time.Second
		}

		clickTasks = append(clickTasks,
			// find the node for the xpath and click the first match
			chromedp.ActionFunc(func(ctx context.Context) error {
				var found []*cdp.Node
				err := chromedp.Nodes(target, &found, chromedp.AtLeast(0)).Do(ctx)
				if err != nil {
					logger.Debugf("Error %s looking for click xpath %s.\n", err, target)
					return err
				}

				if len(found) == 0 {
					logger.Debugf("Click xpath %s not found in page.\n", target)
					return nil
				}

				logger.Debugf("Clicking %s\n", target)
				return chromedp.MouseClickNode(found[0]).Do(ctx)
			}),
			chromedp.Sleep(pause),
		)
	}

	return clickTasks
}
// getRemoteCDPWSAddress returns the complete remote address that is required
// to access the cdp instance.
//
// It issues a GET request to url, decodes the response body as a JSON object
// and returns the string stored under "webSocketDebuggerUrl". An error is
// returned when the request fails, the body is not valid JSON, or that field
// is missing, empty or not a string.
func getRemoteCDPWSAddress(ctx context.Context, url string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	var result map[string]interface{}
	var json = jsoniter.ConfigCompatibleWithStandardLibrary
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}

	// Use the comma-ok form: a response without the field (or with a
	// non-string value) must produce an error, not a panic.
	remote, ok := result["webSocketDebuggerUrl"].(string)
	if !ok || remote == "" {
		return "", fmt.Errorf("CDP: no webSocketDebuggerUrl in response from %s", url)
	}

	logger.Debugf("Remote cdp instance found %s", remote)
	return remote, nil
}
// cdpHeaders collects the headers configured in driverOptions into a map
// keyed by header name, logging each one. Entries with an empty key are
// skipped; the returned map is never nil.
func cdpHeaders(driverOptions scraperDriverOptions) map[string]interface{} {
	result := make(map[string]interface{})

	// ranging over an unset Headers collection simply yields nothing
	for _, header := range driverOptions.Headers {
		if header.Key == "" {
			continue
		}
		result[header.Key] = header.Value
		logger.Debugf("[scraper] adding header <%s:%s>", header.Key, header.Value)
	}

	return result
}
// proxyAuthPattern matches the leading part of an http(s) proxy URL with
// optional "user:password@" credentials. Submatches used below:
// 1 = scheme including "://", 3 = user, 4 = password, 5 = host with optional port.
// It is compiled once here instead of on every call.
var proxyAuthPattern = regexp.MustCompile(`^(https?:\/\/)(([\P{Cc}]+):([\P{Cc}]+)@)?(([a-zA-Z0-9][a-zA-Z0-9.-]*)(:[0-9]{1,5})?)`)

// proxyUsesAuth reports whether proxyUrl is an http(s) proxy URL that carries
// "user:password@" credentials. It returns false for an empty string and for
// anything the pattern does not match.
func proxyUsesAuth(proxyUrl string) bool {
	if proxyUrl == "" {
		return false
	}

	groups := proxyAuthPattern.FindStringSubmatch(proxyUrl)
	return groups != nil && groups[3] != ""
}

// splitProxyAuth splits proxyUrl into the proxy address without credentials
// (scheme plus host and optional port), the user name and the password.
// User and password are empty when the URL carries no credentials. A URL the
// pattern does not match is returned unchanged with empty credentials.
func splitProxyAuth(proxyUrl string) (string, string, string) {
	if proxyUrl == "" {
		return "", "", ""
	}

	groups := proxyAuthPattern.FindStringSubmatch(proxyUrl)
	if groups == nil {
		return proxyUrl, "", ""
	}

	return groups[1] + groups[5], groups[3], groups[4]
}