Fix scraping multiple URLs (#5677)

* Hack fix for scraping URLs field
* Rewrite apply function using known value types
This commit is contained in:
WithoutPants 2025-02-26 08:03:08 +11:00 committed by GitHub
parent 587fd9e6b8
commit 1e05766571
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 124 additions and 48 deletions

View file

@ -43,7 +43,9 @@ func (s mappedConfig) applyCommon(c commonMappedConfig, src string) string {
return ret return ret
} }
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig) mappedResults { type isMultiFunc func(key string) bool
func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonMappedConfig, isMulti isMultiFunc) mappedResults {
var ret mappedResults var ret mappedResults
for k, attrConfig := range s { for k, attrConfig := range s {
@ -51,7 +53,7 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM
if attrConfig.Fixed != "" { if attrConfig.Fixed != "" {
// TODO - not sure if this needs to set _all_ indexes for the key // TODO - not sure if this needs to set _all_ indexes for the key
const i = 0 const i = 0
ret = ret.setKey(i, k, attrConfig.Fixed) ret = ret.setSingleValue(i, k, attrConfig.Fixed)
} else { } else {
selector := attrConfig.Selector selector := attrConfig.Selector
selector = s.applyCommon(common, selector) selector = s.applyCommon(common, selector)
@ -63,8 +65,15 @@ func (s mappedConfig) process(ctx context.Context, q mappedQuery, common commonM
if len(found) > 0 { if len(found) > 0 {
result := s.postProcess(ctx, q, attrConfig, found) result := s.postProcess(ctx, q, attrConfig, found)
// HACK - if the key is URLs, then we need to set the value as a multi-value
isMulti := isMulti != nil && isMulti(k)
if isMulti {
ret = ret.setMultiValue(0, k, result)
} else {
for i, text := range result { for i, text := range result {
ret = ret.setKey(i, k, text) ret = ret.setSingleValue(i, k, text)
}
} }
} }
} }
@ -845,37 +854,72 @@ type mappedScraper struct {
Movie *mappedMovieScraperConfig `yaml:"movie"` Movie *mappedMovieScraperConfig `yaml:"movie"`
} }
type mappedResult map[string]string type mappedResult map[string]interface{}
type mappedResults []mappedResult type mappedResults []mappedResult
func (r mappedResult) apply(dest interface{}) { func (r mappedResult) apply(dest interface{}) {
destVal := reflect.ValueOf(dest) destVal := reflect.ValueOf(dest).Elem()
// dest should be a pointer
destVal = destVal.Elem()
// all fields are either string pointers or string slices
for key, value := range r { for key, value := range r {
field := destVal.FieldByName(key) if err := mapFieldValue(destVal, key, value); err != nil {
logger.Errorf("Error mapping field %s in %T: %v", key, dest, err)
if field.IsValid() {
var reflectValue reflect.Value
if field.Kind() == reflect.Ptr {
// need to copy the value, otherwise everything is set to the
// same pointer
localValue := value
reflectValue = reflect.ValueOf(&localValue)
} else {
reflectValue = reflect.ValueOf(value)
}
field.Set(reflectValue)
} else {
logger.Errorf("Field %s does not exist in %T", key, dest)
} }
} }
} }
// mapFieldValue assigns value to the struct field named key on destVal.
//
// destVal must be a settable struct value, for example the Elem of a pointer
// to a struct. The supported conversions are:
//   - string   -> string, *string, or []string (stored as a one-element slice)
//   - []string -> []string
//   - anything else -> the field type, or a new pointer to the field's element
//     type, when reflect reports the value's type as convertible
//
// An error is returned, and the field is left untouched, when destVal is not a
// struct, when the field does not exist or cannot be set, when value is nil,
// or when none of the conversions above applies.
func mapFieldValue(destVal reflect.Value, key string, value interface{}) error {
	// FieldByName panics on anything that is not a struct.
	if destVal.Kind() != reflect.Struct {
		return fmt.Errorf("cannot map field on non-struct %s", destVal.Kind())
	}

	field := destVal.FieldByName(key)

	// FieldByName yields the zero Value for an unknown field, and calling Type
	// on the zero Value panics, so validate before inspecting the field type.
	if !field.IsValid() || !field.CanSet() {
		return fmt.Errorf("field does not exist or cannot be set")
	}

	fieldType := field.Type()

	switch v := value.(type) {
	case string:
		switch {
		case fieldType.Kind() == reflect.String:
			field.SetString(v)
		case fieldType.Kind() == reflect.Pointer && fieldType.Elem().Kind() == reflect.String:
			// allocate a fresh string per field so results never share a pointer
			ptr := reflect.New(fieldType.Elem())
			ptr.Elem().SetString(v)
			field.Set(ptr)
		case fieldType.Kind() == reflect.Slice && fieldType.Elem().Kind() == reflect.String:
			field.Set(reflect.ValueOf([]string{v}))
		default:
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
	case []string:
		// only a string slice field can receive a string slice
		if fieldType.Kind() != reflect.Slice || fieldType.Elem().Kind() != reflect.String {
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
		field.Set(reflect.ValueOf(v))
	default:
		// fallback to reflection
		reflectValue := reflect.ValueOf(value)
		// a nil interface produces the zero Value, whose Type method panics
		if !reflectValue.IsValid() {
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}

		reflectValueType := reflectValue.Type()
		switch {
		case reflectValueType.ConvertibleTo(fieldType):
			field.Set(reflectValue.Convert(fieldType))
		case fieldType.Kind() == reflect.Pointer && reflectValueType.ConvertibleTo(fieldType.Elem()):
			ptr := reflect.New(fieldType.Elem())
			ptr.Elem().Set(reflectValue.Convert(fieldType.Elem()))
			field.Set(ptr)
		default:
			return fmt.Errorf("cannot convert %T to %s", value, fieldType)
		}
	}

	return nil
}
func (r mappedResults) setSingleValue(index int, key string, value string) mappedResults {
if index >= len(r) { if index >= len(r) {
r = append(r, make(mappedResult)) r = append(r, make(mappedResult))
} }
@ -885,6 +929,20 @@ func (r mappedResults) setKey(index int, key string, value string) mappedResults
return r return r
} }
// setMultiValue stores the string slice value under key in the result at
// position index and returns the resulting slice of results. When index is at
// or past the current length, one empty result is appended first. The
// assignment is logged at debug level before it is made.
func (r mappedResults) setMultiValue(index int, key string, value []string) mappedResults {
	results := r
	if len(results) <= index {
		results = append(results, mappedResult{})
	}

	logger.Debugf(`[%d][%s] = %s`, index, key, value)

	target := results[index]
	target[key] = value

	return results
}
// urlsIsMulti reports whether key is exactly "URLs". The comparison is
// case-sensitive.
func urlsIsMulti(key string) bool {
	const multiValueKey = "URLs"

	return multiValueKey == key
}
func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*models.ScrapedPerformer, error) { func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*models.ScrapedPerformer, error) {
var ret models.ScrapedPerformer var ret models.ScrapedPerformer
@ -895,12 +953,12 @@ func (s mappedScraper) scrapePerformer(ctx context.Context, q mappedQuery) (*mod
performerTagsMap := performerMap.Tags performerTagsMap := performerMap.Tags
results := performerMap.process(ctx, q, s.Common) results := performerMap.process(ctx, q, s.Common, urlsIsMulti)
// now apply the tags // now apply the tags
if performerTagsMap != nil { if performerTagsMap != nil {
logger.Debug(`Processing performer tags:`) logger.Debug(`Processing performer tags:`)
tagResults := performerTagsMap.process(ctx, q, s.Common) tagResults := performerTagsMap.process(ctx, q, s.Common, nil)
for _, p := range tagResults { for _, p := range tagResults {
tag := &models.ScrapedTag{} tag := &models.ScrapedTag{}
@ -928,7 +986,8 @@ func (s mappedScraper) scrapePerformers(ctx context.Context, q mappedQuery) ([]*
return nil, nil return nil, nil
} }
results := performerMap.process(ctx, q, s.Common) // isMulti is nil because it will behave incorrectly when scraping multiple performers
results := performerMap.process(ctx, q, s.Common, nil)
for _, r := range results { for _, r := range results {
var p models.ScrapedPerformer var p models.ScrapedPerformer
r.apply(&p) r.apply(&p)
@ -957,7 +1016,7 @@ func (s mappedScraper) processSceneRelationships(ctx context.Context, q mappedQu
if sceneStudioMap != nil { if sceneStudioMap != nil {
logger.Debug(`Processing scene studio:`) logger.Debug(`Processing scene studio:`)
studioResults := sceneStudioMap.process(ctx, q, s.Common) studioResults := sceneStudioMap.process(ctx, q, s.Common, nil)
if len(studioResults) > 0 && resultIndex < len(studioResults) { if len(studioResults) > 0 && resultIndex < len(studioResults) {
studio := &models.ScrapedStudio{} studio := &models.ScrapedStudio{}
@ -981,14 +1040,15 @@ func (s mappedScraper) processPerformers(ctx context.Context, performersMap mapp
// now apply the performers and tags // now apply the performers and tags
if performersMap.mappedConfig != nil { if performersMap.mappedConfig != nil {
logger.Debug(`Processing performers:`) logger.Debug(`Processing performers:`)
performerResults := performersMap.process(ctx, q, s.Common) // isMulti is nil because it will behave incorrectly when scraping multiple performers
performerResults := performersMap.process(ctx, q, s.Common, nil)
scenePerformerTagsMap := performersMap.Tags scenePerformerTagsMap := performersMap.Tags
// process performer tags once // process performer tags once
var performerTagResults mappedResults var performerTagResults mappedResults
if scenePerformerTagsMap != nil { if scenePerformerTagsMap != nil {
performerTagResults = scenePerformerTagsMap.process(ctx, q, s.Common) performerTagResults = scenePerformerTagsMap.process(ctx, q, s.Common, nil)
} }
for _, p := range performerResults { for _, p := range performerResults {
@ -1011,7 +1071,7 @@ func (s mappedScraper) processPerformers(ctx context.Context, performersMap mapp
func processRelationships[T any](ctx context.Context, s mappedScraper, relationshipMap mappedConfig, q mappedQuery) []*T { func processRelationships[T any](ctx context.Context, s mappedScraper, relationshipMap mappedConfig, q mappedQuery) []*T {
var ret []*T var ret []*T
results := relationshipMap.process(ctx, q, s.Common) results := relationshipMap.process(ctx, q, s.Common, nil)
for _, p := range results { for _, p := range results {
var value T var value T
@ -1032,7 +1092,8 @@ func (s mappedScraper) scrapeScenes(ctx context.Context, q mappedQuery) ([]*Scra
} }
logger.Debug(`Processing scenes:`) logger.Debug(`Processing scenes:`)
results := sceneMap.process(ctx, q, s.Common) // isMulti is nil because urlsIsMulti will behave incorrectly when scraping multiple scenes
results := sceneMap.process(ctx, q, s.Common, nil)
for i, r := range results { for i, r := range results {
logger.Debug(`Processing scene:`) logger.Debug(`Processing scene:`)
@ -1054,7 +1115,7 @@ func (s mappedScraper) scrapeScene(ctx context.Context, q mappedQuery) (*Scraped
sceneMap := sceneScraperConfig.mappedConfig sceneMap := sceneScraperConfig.mappedConfig
logger.Debug(`Processing scene:`) logger.Debug(`Processing scene:`)
results := sceneMap.process(ctx, q, s.Common) results := sceneMap.process(ctx, q, s.Common, urlsIsMulti)
var ret ScrapedScene var ret ScrapedScene
if len(results) > 0 { if len(results) > 0 {
@ -1087,7 +1148,7 @@ func (s mappedScraper) scrapeImage(ctx context.Context, q mappedQuery) (*Scraped
imageStudioMap := imageScraperConfig.Studio imageStudioMap := imageScraperConfig.Studio
logger.Debug(`Processing image:`) logger.Debug(`Processing image:`)
results := imageMap.process(ctx, q, s.Common) results := imageMap.process(ctx, q, s.Common, urlsIsMulti)
// now apply the performers and tags // now apply the performers and tags
if imagePerformersMap != nil { if imagePerformersMap != nil {
@ -1102,7 +1163,7 @@ func (s mappedScraper) scrapeImage(ctx context.Context, q mappedQuery) (*Scraped
if imageStudioMap != nil { if imageStudioMap != nil {
logger.Debug(`Processing image studio:`) logger.Debug(`Processing image studio:`)
studioResults := imageStudioMap.process(ctx, q, s.Common) studioResults := imageStudioMap.process(ctx, q, s.Common, nil)
if len(studioResults) > 0 { if len(studioResults) > 0 {
studio := &models.ScrapedStudio{} studio := &models.ScrapedStudio{}
@ -1138,12 +1199,12 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap
galleryStudioMap := galleryScraperConfig.Studio galleryStudioMap := galleryScraperConfig.Studio
logger.Debug(`Processing gallery:`) logger.Debug(`Processing gallery:`)
results := galleryMap.process(ctx, q, s.Common) results := galleryMap.process(ctx, q, s.Common, urlsIsMulti)
// now apply the performers and tags // now apply the performers and tags
if galleryPerformersMap != nil { if galleryPerformersMap != nil {
logger.Debug(`Processing gallery performers:`) logger.Debug(`Processing gallery performers:`)
performerResults := galleryPerformersMap.process(ctx, q, s.Common) performerResults := galleryPerformersMap.process(ctx, q, s.Common, urlsIsMulti)
for _, p := range performerResults { for _, p := range performerResults {
performer := &models.ScrapedPerformer{} performer := &models.ScrapedPerformer{}
@ -1154,7 +1215,7 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap
if galleryTagsMap != nil { if galleryTagsMap != nil {
logger.Debug(`Processing gallery tags:`) logger.Debug(`Processing gallery tags:`)
tagResults := galleryTagsMap.process(ctx, q, s.Common) tagResults := galleryTagsMap.process(ctx, q, s.Common, nil)
for _, p := range tagResults { for _, p := range tagResults {
tag := &models.ScrapedTag{} tag := &models.ScrapedTag{}
@ -1165,7 +1226,7 @@ func (s mappedScraper) scrapeGallery(ctx context.Context, q mappedQuery) (*Scrap
if galleryStudioMap != nil { if galleryStudioMap != nil {
logger.Debug(`Processing gallery studio:`) logger.Debug(`Processing gallery studio:`)
studioResults := galleryStudioMap.process(ctx, q, s.Common) studioResults := galleryStudioMap.process(ctx, q, s.Common, nil)
if len(studioResults) > 0 { if len(studioResults) > 0 {
studio := &models.ScrapedStudio{} studio := &models.ScrapedStudio{}
@ -1199,11 +1260,11 @@ func (s mappedScraper) scrapeGroup(ctx context.Context, q mappedQuery) (*models.
movieStudioMap := movieScraperConfig.Studio movieStudioMap := movieScraperConfig.Studio
movieTagsMap := movieScraperConfig.Tags movieTagsMap := movieScraperConfig.Tags
results := movieMap.process(ctx, q, s.Common) results := movieMap.process(ctx, q, s.Common, urlsIsMulti)
if movieStudioMap != nil { if movieStudioMap != nil {
logger.Debug(`Processing movie studio:`) logger.Debug(`Processing movie studio:`)
studioResults := movieStudioMap.process(ctx, q, s.Common) studioResults := movieStudioMap.process(ctx, q, s.Common, nil)
if len(studioResults) > 0 { if len(studioResults) > 0 {
studio := &models.ScrapedStudio{} studio := &models.ScrapedStudio{}
@ -1215,7 +1276,7 @@ func (s mappedScraper) scrapeGroup(ctx context.Context, q mappedQuery) (*models.
// now apply the tags // now apply the tags
if movieTagsMap != nil { if movieTagsMap != nil {
logger.Debug(`Processing movie tags:`) logger.Debug(`Processing movie tags:`)
tagResults := movieTagsMap.process(ctx, q, s.Common) tagResults := movieTagsMap.process(ctx, q, s.Common, nil)
for _, p := range tagResults { for _, p := range tagResults {
tag := &models.ScrapedTag{} tag := &models.ScrapedTag{}

View file

@ -32,6 +32,7 @@ const htmlDoc1 = `
</td> </td>
<td class="paramvalue"> <td class="paramvalue">
<a href="/html/m_links/Mia_Malkova/">Mia Malkova</a>&nbsp; <a href="/html/m_links/Mia_Malkova/">Mia Malkova</a>&nbsp;
<a href="/html/m_links/Mia_Malkova/second_url">Mia Malkova</a>&nbsp;
</td> </td>
</tr> </tr>
<tr> <tr>
@ -206,6 +207,8 @@ func makeXPathConfig() mappedPerformerScraperConfig {
} }
config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`) config.mappedConfig["Name"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a`)
config.mappedConfig["URL"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a/@href`)
config.mappedConfig["URLs"] = makeSimpleAttrConfig(makeCommonXPath("Babe Name:") + `/a/@href`)
config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:")) config.mappedConfig["Ethnicity"] = makeSimpleAttrConfig(makeCommonXPath("Ethnicity:"))
config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:")) config.mappedConfig["Aliases"] = makeSimpleAttrConfig(makeCommonXPath("Aliases:"))
config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:")) config.mappedConfig["EyeColor"] = makeSimpleAttrConfig(makeCommonXPath("Eye Color:"))
@ -321,6 +324,8 @@ func TestScrapePerformerXPath(t *testing.T) {
} }
const performerName = "Mia Malkova" const performerName = "Mia Malkova"
const url = "/html/m_links/Mia_Malkova/"
const secondURL = "/html/m_links/Mia_Malkova/second_url"
const ethnicity = "Caucasian" const ethnicity = "Caucasian"
const country = "United States" const country = "United States"
const birthdate = "1992-07-01" const birthdate = "1992-07-01"
@ -338,6 +343,16 @@ func TestScrapePerformerXPath(t *testing.T) {
const weight = "57" // 126 lb const weight = "57" // 126 lb
verifyField(t, performerName, performer.Name, "Name") verifyField(t, performerName, performer.Name, "Name")
verifyField(t, url, performer.URL, "URL")
// #5294 - test multiple URLs
if len(performer.URLs) != 2 {
t.Errorf("Expected 2 URLs, got %d", len(performer.URLs))
} else {
verifyField(t, url, &performer.URLs[0], "URLs[0]")
verifyField(t, secondURL, &performer.URLs[1], "URLs[1]")
}
verifyField(t, gender, performer.Gender, "Gender") verifyField(t, gender, performer.Gender, "Gender")
verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity") verifyField(t, ethnicity, performer.Ethnicity, "Ethnicity")
verifyField(t, country, performer.Country, "Country") verifyField(t, country, performer.Country, "Country")
@ -569,7 +584,7 @@ func makeSceneXPathConfig() mappedScraper {
performerConfig := make(mappedConfig) performerConfig := make(mappedConfig)
performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`) performerConfig["Name"] = makeSimpleAttrConfig(`$performerElem/@data-mxptext`)
performerConfig["URL"] = makeSimpleAttrConfig(`$performerElem/@href`) performerConfig["URLs"] = makeSimpleAttrConfig(`$performerElem/@href`)
config.Performers.mappedConfig = performerConfig config.Performers.mappedConfig = performerConfig
studioConfig := make(mappedConfig) studioConfig := make(mappedConfig)
@ -653,8 +668,8 @@ func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []strin
} }
if i < len(actualPerformers) { if i < len(actualPerformers) {
actualName = *actualPerformers[i].Name actualName = *actualPerformers[i].Name
if actualPerformers[i].URL != nil { if len(actualPerformers[i].URLs) == 1 {
actualURL = *actualPerformers[i].URL actualURL = actualPerformers[i].URLs[0]
} }
} }
@ -805,7 +820,7 @@ func TestLoadInvalidXPath(t *testing.T) {
doc: doc, doc: doc,
} }
config.process(context.Background(), q, nil) config.process(context.Background(), q, nil, nil)
} }
type mockGlobalConfig struct{} type mockGlobalConfig struct{}