stash/pkg/scraper/xpath.go
woodgen 4045ddf3e9
Implement scraping movies by URL (#709)
* api/urlbuilders/movie: Auto format.

* graphql+pkg+ui: Implement scraping movies by URL.

This patch implements the missing required boilerplate for scraping
movies by URL, using performers and scenes as a reference.

Although this patch contains a big chunk of groundwork for enabling
scraping movies by fragment, the feature would require additional
changes to be completely implemented and was not tested.

* graphql+pkg+ui: Scrape movie studio.

Extends and corrects the movie model for the ability to store and
dereference studio IDs with received studio string from the scraper.
This was done with Scenes as a reference. For simplicity the duplication
of having `ScrapedMovieStudio` and `ScrapedSceneStudio` was kept, which
should probably be refactored to be the same type in the model in the
future.

* ui/movies: Add movie scrape dialog.

Adds possibility to update existing movie entries with the URL scraper.

For this the MovieScrapeDialog.tsx was implemented with Performers and
Scenes as a reference. In addition DurationUtils needs to be called one
time for converting seconds from the model to the string that is
displayed in the component. This seemed the least intrusive to me as it
kept a ScrapeResult<string> type compatible with ScrapedInputGroupRow.
2020-08-10 15:34:15 +10:00

218 lines
4.8 KiB
Go

package scraper
import (
"bytes"
"errors"
"net/url"
"regexp"
"strings"
"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
)
// xpathScraper scrapes pages using the XPath scraper definition that its
// scraper type configuration refers to by name.
type xpathScraper struct {
// scraper is the scraper type configuration; its Scraper field names the
// entry looked up in config.XPathScrapers and its QueryURL field is the URL
// template used for name and fragment scrapes.
scraper scraperTypeConfig
// config holds the XPath scraper definitions and the optional debug options.
config config
// globalConfig is passed through to the package-level loadURL when fetching.
globalConfig GlobalConfig
}
// newXpathScraper returns an xpathScraper populated with the given scraper
// type configuration, scraper configuration and global configuration.
func newXpathScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *xpathScraper {
	ret := new(xpathScraper)
	ret.scraper = scraper
	ret.config = config
	ret.globalConfig = globalConfig
	return ret
}
// getXpathScraper looks up the mapped scraper definition registered in the
// configuration's XPathScrapers under the configured scraper name. Callers
// treat a nil result as "not found".
func (s *xpathScraper) getXpathScraper() *mappedScraper {
	name := s.scraper.Scraper
	return s.config.XPathScrapers[name]
}
// scrapeURL resolves the configured mapped scraper and loads the document at
// url. It returns the parsed document together with the mapped scraper, or an
// error if the scraper is not present in the config or the page fails to load.
func (s *xpathScraper) scrapeURL(url string) (*html.Node, *mappedScraper, error) {
	mapped := s.getXpathScraper()
	if mapped == nil {
		return nil, nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	node, loadErr := s.loadURL(url)
	if loadErr != nil {
		return nil, nil, loadErr
	}

	return node, mapped, nil
}
// scrapePerformerByURL loads the page at url and runs the mapped scraper's
// performer mapping against it.
func (s *xpathScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
	doc, mapped, err := s.scrapeURL(url)
	if err != nil {
		return nil, err
	}

	return mapped.scrapePerformer(s.getXPathQuery(doc))
}
// scrapeSceneByURL loads the page at url and runs the mapped scraper's scene
// mapping against it.
func (s *xpathScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
	doc, mapped, err := s.scrapeURL(url)
	if err != nil {
		return nil, err
	}

	return mapped.scrapeScene(s.getXPathQuery(doc))
}
// scrapeMovieByURL loads the page at url and runs the mapped scraper's movie
// mapping against it.
func (s *xpathScraper) scrapeMovieByURL(url string) (*models.ScrapedMovie, error) {
	doc, mapped, err := s.scrapeURL(url)
	if err != nil {
		return nil, err
	}

	return mapped.scrapeMovie(s.getXPathQuery(doc))
}
// scrapePerformersByName substitutes the URL-escaped name into the configured
// query URL, loads the resulting page and runs the mapped scraper's
// performer-list mapping against it. It returns an error if the scraper is
// not present in the config or the page fails to load.
func (s *xpathScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
	mapped := s.getXpathScraper()
	if mapped == nil {
		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	const placeholder = "{}"

	// every occurrence of the placeholder becomes the URL-escaped name
	queryURL := strings.Replace(s.scraper.QueryURL, placeholder, url.QueryEscape(name), -1)

	doc, err := s.loadURL(queryURL)
	if err != nil {
		return nil, err
	}

	return mapped.scrapePerformers(s.getXPathQuery(doc))
}
// scrapePerformerByFragment is not implemented for xpath scrapers; it always
// returns a nil performer and an error saying so.
func (s *xpathScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
	const msg = "scrapePerformerByFragment not supported for xpath scraper"
	return nil, errors.New(msg)
}
// scrapeSceneByFragment resolves the stored scene for the update fragment,
// builds a URL for it from the configured query URL, loads that page and runs
// the mapped scraper's scene mapping against it. It returns an error if the
// scene cannot be resolved, the scraper is not present in the config, or the
// page fails to load.
func (s *xpathScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
	stored, err := sceneFromUpdateFragment(scene)
	switch {
	case err != nil:
		return nil, err
	case stored == nil:
		return nil, errors.New("no scene found")
	}

	// build the URL to query from the stored scene
	sceneURL := constructSceneURL(s.scraper.QueryURL, stored)

	mapped := s.getXpathScraper()
	if mapped == nil {
		return nil, errors.New("xpath scraper with name " + s.scraper.Scraper + " not found in config")
	}

	doc, err := s.loadURL(sceneURL)
	if err != nil {
		return nil, err
	}

	return mapped.scrapeScene(s.getXPathQuery(doc))
}
// loadURL fetches url through the package-level loadURL helper and parses the
// response as HTML. When the scraper's debug options enable PrintHTML, the
// parsed document is rendered back to text and logged at info level.
func (s *xpathScraper) loadURL(url string) (*html.Node, error) {
	reader, err := loadURL(url, s.config, s.globalConfig)
	if err != nil {
		return nil, err
	}

	doc, err := html.Parse(reader)
	if err != nil {
		return doc, err
	}

	if opts := s.config.DebugOptions; opts != nil && opts.PrintHTML {
		var rendered bytes.Buffer
		html.Render(&rendered, doc)
		logger.Infof("loadURL (%s) response: \n%s", url, rendered.String())
	}

	return doc, nil
}
// getXPathQuery wraps doc in an xpathQuery that refers back to this scraper.
func (s *xpathScraper) getXPathQuery(doc *html.Node) *xpathQuery {
	query := new(xpathQuery)
	query.doc = doc
	query.scraper = s
	return query
}
// xpathQuery pairs a parsed HTML document with the scraper that produced it.
type xpathQuery struct {
// doc is the root node that XPath selectors are evaluated against.
doc *html.Node
// scraper is used to load further URLs and to build queries for sub-scrapes.
scraper *xpathScraper
}
// runQuery evaluates the XPath selector against the query's document and
// returns the normalised text of every matching node, skipping nodes whose
// text is empty. If the selector cannot be evaluated, a warning is logged and
// nil is returned. The result is also nil when nothing non-empty matched.
func (q *xpathQuery) runQuery(selector string) []string {
	found, err := htmlquery.QueryAll(q.doc, selector)
	if err != nil {
		logger.Warnf("Error parsing xpath expression '%s': %s", selector, err.Error())
		return nil
	}

	var ret []string
	for _, n := range found {
		// don't add empty strings; compute the text once and reuse it rather
		// than extracting and normalising the node a second time
		if text := q.nodeText(n); text != "" {
			ret = append(ret, text)
		}
	}

	return ret
}
// xpathRepeatedSpaceRE matches runs of space characters so they can be
// collapsed into a single space. It is compiled once instead of per call.
var xpathRepeatedSpaceRE = regexp.MustCompile(" +")

// nodeText returns the normalised text of n.
//
// For comment nodes the rendered comment markup is used, because InnerText
// yields nothing for them; for every other node the inner text is used. The
// text is then trimmed of leading and trailing whitespace, runs of spaces are
// collapsed to one space, and newlines are removed. A nil node yields "".
func (q *xpathQuery) nodeText(n *html.Node) string {
	var ret string
	switch {
	case n == nil:
		// nothing to extract; InnerText would dereference the nil node
		return ""
	case n.Type == html.CommentNode:
		ret = htmlquery.OutputHTML(n, true)
	default:
		ret = htmlquery.InnerText(n)
	}

	// trim all leading and trailing whitespace
	ret = strings.TrimSpace(ret)

	// remove multiple whitespace
	ret = xpathRepeatedSpaceRE.ReplaceAllString(ret, " ")

	// TODO - make this optional
	ret = strings.Replace(ret, "\n", "", -1)

	return ret
}
// subScrape loads the document at value with the owning scraper and returns a
// query over it. If loading fails, a warning is logged and nil is returned.
func (q *xpathQuery) subScrape(value string) mappedQuery {
	subDoc, err := q.scraper.loadURL(value)
	if err == nil {
		return q.scraper.getXPathQuery(subDoc)
	}

	logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
	return nil
}