Skip to content

Commit

Permalink
Merge pull request #1846 from diamondpete/sitechanges
Browse files (browse the repository at this point in the history)
Aussie Ass and 5KPorn Scraper Cleanup
  • Loading branch information
DirtyRacer1337 authored Sep 30, 2023
2 parents b41c5be + 2c123ed commit 753780d
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 45 deletions.
2 changes: 2 additions & 0 deletions Contents/Code/PAdatabaseActors.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@
'Miss Jade Indica': ['Jade Indica'],
'Moe Johnson': ['Moe The Monster Johnson'],
'Monica Sweet': ['Jo', 'Monica Naggy', 'Monika Sweet'],
'Monte Luxe': ['Monte Cooper'],
'Monty Memphis': ['Monty', 'Monty Cash', 'Monty Manthis', 'Monty Mathis'],
'Murgur': ['Mogur', 'Mugor', 'Mugur Porn', 'Mugur'],
'Mya Mason': ['Maya Mason', 'Maya Madison', 'Mya Manson', 'Mya Madison'],
Expand Down Expand Up @@ -273,6 +274,7 @@
'Rosaline Rosa': ['Rosaline Rose'],
'Ruckus XXX': ['Ruckus'],
'Ruthless Kid': ['Lil D', 'Lil D.'],
'Samm Rosee': ['Sammrosee'],
'Sara Luvv': ['Sara Luv'],
'Sara Rich': ['Nina Winslet', 'Mia Park', 'Mia Parker', 'Veronica Rich', 'Liloo Von'],
'Sasha D': ['Sasha (V)', 'Sandra'],
Expand Down
56 changes: 46 additions & 10 deletions Contents/Code/network5Kporn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,33 @@ def search(results, lang, siteNum, searchData):
cookies = {'nats': 'MC4wLjMuNTguMC4wLjAuMC4w'}
req = PAutils.HTTPRequest(PAsearchSites.getSearchSearchURL(siteNum) + searchData.encoded, cookies=cookies)
searchResults = HTML.ElementFromString(req.json()['html'])
for sceneURL in searchResults.xpath('//div[@class="ep"]'):
titleNoFormatting = searchResults.xpath('.//h3[@class="ep-title"]')[0].text_content().strip()
sceneURL = searchResults.xpath('.//a/@href')[0]
for searchResult in searchResults.xpath('//div[@class="ep"]'):
titleNoFormatting = searchResult.xpath('.//h3[@class="ep-title"]')[0].text_content().strip()
sceneURL = searchResult.xpath('.//a/@href')[0]
curID = PAutils.Encode(sceneURL)

score = 100 - Util.LevenshteinDistance(searchData.title.lower(), titleNoFormatting.lower())
releaseDate = searchData.dateFormat() if searchData.date else ''

results.Append(MetadataSearchResult(id='%s|%d' % (curID, siteNum), name='%s [%s]' % (titleNoFormatting, PAsearchSites.getSearchSiteName(siteNum)), score=score, lang=lang))
score = 100 - Util.LevenshteinDistance(titleNoActors.lower(), titleNoFormatting.lower())

results.Append(MetadataSearchResult(id='%s|%d|%s' % (curID, siteNum, releaseDate), name='%s [%s]' % (titleNoFormatting, PAsearchSites.getSearchSiteName(siteNum)), score=score, lang=lang))

return results


def update(metadata, lang, siteNum, movieGenres, movieActors, art):
metadata_id = str(metadata.id).split('|')
sceneURL = PAutils.Decode(metadata_id[0])
sceneDate = ''
if len(metadata_id) > 2:
sceneDate = metadata_id[2]

cookies = {'nats': 'MC4wLjMuNTguMC4wLjAuMC4w'}
req = PAutils.HTTPRequest(sceneURL, cookies=cookies)
detailsPageElements = HTML.ElementFromString(req.text)

# Title
metadata.title = detailsPageElements.xpath('//title')[0].text_content().split('|')[0]
metadata.title = PAutils.parseTitle(detailsPageElements.xpath('//title')[0].text_content().split('|')[0], siteNum)

# Summary
metadata.summary = detailsPageElements.xpath('//div[contains(@class, "video-summary")]//p[@class=""]')[0].text_content()
Expand All @@ -46,6 +51,10 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
date_object = parse(date)
metadata.originally_available_at = date_object
metadata.year = metadata.originally_available_at.year
elif sceneDate:
date_object = parse(sceneDate)
metadata.originally_available_at = date_object
metadata.year = metadata.originally_available_at.year

# Genres

Expand Down Expand Up @@ -74,7 +83,17 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
for img in detailsPageElements.xpath(xpath):
art.append(img)

for idx in range(1, 3):
photoPageURL = '%s/photoset?page=%d' % (sceneURL, idx)
req = PAutils.HTTPRequest(photoPageURL, cookies=cookies)
photoPageElements = HTML.ElementFromString(req.text)
for img in photoPageElements.xpath('//img[@class="card-img-top"]/@src'):
if 'full' not in img:
art.append(img)

Log('Artwork found: %d' % len(art))
images = []
posterExists = False
for idx, posterUrl in enumerate(art, 1):
if not PAsearchSites.posterAlreadyExists(posterUrl, metadata):
# Download image file for analysis
Expand All @@ -84,12 +103,29 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
resized_image = Image.open(im)
width, height = resized_image.size
# Add the image proxy items to the collection
if width > 1 or height > width:
if height > width:
# Item is a poster
metadata.posters[posterUrl] = Proxy.Media(image.content, sort_order=idx)
if width > 100 and width > height:
metadata.posters[cleanUrl] = Proxy.Media(image.content, sort_order=idx)
posterExists = True
if width > 1000 and width > height:
# Item is an art item
metadata.art[posterUrl] = Proxy.Media(image.content, sort_order=idx)
images.append((image, cleanUrl))
metadata.art[cleanUrl] = Proxy.Media(image.content, sort_order=idx)
except:
pass
elif PAsearchSites.posterOnlyAlreadyExists(cleanUrl, metadata):
posterExists = True

if not posterExists:
for idx, (image, posterUrl) in enumerate(images, 1):
try:
im = StringIO(image.content)
resized_image = Image.open(im)
width, height = resized_image.size
# Add the image proxy items to the collection
if width > 1:
# Item is a poster
metadata.posters[posterUrl] = Proxy.Media(image.content, sort_order=idx)
except:
pass

Expand Down
110 changes: 75 additions & 35 deletions Contents/Code/siteAussieAss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,43 @@


def search(results, lang, siteNum, searchData):

sceneID = re.sub(r'\D.*', '', searchData.title)
sceneID = None
parts = searchData.title.split()
if unicode(parts[0], 'UTF-8').isdigit():
sceneID = parts[0]
searchData.title = searchData.title.replace(sceneID, '', 1).strip().replace('\'', '')

if sceneID:
sceneURL = PAsearchSites.getSearchBaseURL(siteNum) + "/webmasters/" + sceneID
sceneURL = PAsearchSites.getSearchBaseURL(siteNum) + "/webmasters/" + re.sub(r'^0+', '', sceneID)
req = PAutils.HTTPRequest(sceneURL)
searchResults = HTML.ElementFromString(req.text)
titleNoFormatting = re.sub(r'^\d+', '', searchResults.xpath('//h1/text()')[0].title())
try:
titleNoFormatting = PAutils.parseTitle(re.sub(r'^\d+', '', searchResults.xpath('//h1/text()|//h4/span/text()')[0]).strip().lower(), siteNum)
except:
try:
sceneURL = '%s/tour/updates/%s-%s.html' % (PAsearchSites.getSearchBaseURL(siteNum), sceneID, slugify(searchData.title))
req = PAutils.HTTPRequest(sceneURL)
searchResults = HTML.ElementFromString(req.text)
titleNoFormatting = PAutils.parseTitle(re.sub(r'^\d+', '', searchResults.xpath('//h4/span/text()')[0]).strip().lower(), siteNum)
except:
try:
sceneURL = '%s/tour/updates/%s.html' % (PAsearchSites.getSearchBaseURL(siteNum), re.sub(r'\W', '', searchData.title))
req = PAutils.HTTPRequest(sceneURL)
searchResults = HTML.ElementFromString(req.text)
titleNoFormatting = PAutils.parseTitle(re.sub(r'^\d+', '', searchResults.xpath('//h1/text()|//h4/span/text()')[0]).strip().lower(), siteNum)
except:
sceneURL = '%s/tour/updates/%s.html' % (PAsearchSites.getSearchBaseURL(siteNum), slugify(searchData.title))
req = PAutils.HTTPRequest(sceneURL)
searchResults = HTML.ElementFromString(req.text)
titleNoFormatting = PAutils.parseTitle(re.sub(r'^\d+', '', searchResults.xpath('//h1/text()|//h4/span/text()')[0]).strip().lower(), siteNum)

curID = PAutils.Encode(sceneURL)

releaseDate = searchData.dateFormat() if searchData.date else ''

score = 100

results.Append(MetadataSearchResult(id='%s|%d' % (curID, siteNum), name='%s [%s]' % (titleNoFormatting, PAsearchSites.getSearchSiteName(siteNum)), score=score, lang=lang))
results.Append(MetadataSearchResult(id='%s|%d|%s' % (curID, siteNum, releaseDate), name='%s [%s]' % (titleNoFormatting, PAsearchSites.getSearchSiteName(siteNum)), score=score, lang=lang))
else:
# Handle 3 Types of Links: First, Last; First Only; First-Last
try:
Expand Down Expand Up @@ -51,9 +75,9 @@ def search(results, lang, siteNum, searchData):
req = PAutils.HTTPRequest(PAsearchSites.getSearchBaseURL(siteNum) + searchResults.xpath('//a[contains(@class, "in_stditem")]/@href')[1])
searchResults = HTML.ElementFromString(req.text)
for searchResult in searchResults.xpath('//div[@class="infos"]'):
resultTitleID = searchResult.xpath('.//span[@class="video-title"]')[0].text_content().strip().title()
resultTitleID = searchResult.xpath('.//span[@class="video-title"]')[0].text_content().strip()

titleNoFormatting = re.sub(r'^\d+', '', resultTitleID)
titleNoFormatting = PAutils.parseTitle(re.sub(r'^\d+', '', resultTitleID).lower(), siteNum)

resultID = re.sub(r'\D.*', '', resultTitleID)

Expand Down Expand Up @@ -84,21 +108,20 @@ def search(results, lang, siteNum, searchData):
def update(metadata, lang, siteNum, movieGenres, movieActors, art):
metadata_id = str(metadata.id).split('|')
sceneURL = PAutils.Decode(metadata_id[0])
try:
sceneDate = ''
if len(metadata_id) > 2:
sceneDate = metadata_id[2]
except:
pass
req = PAutils.HTTPRequest(sceneURL)
detailsPageElements = HTML.ElementFromString(req.text)

# Title
if 'webmasters' in sceneURL:
resultTitleID = detailsPageElements.xpath('//h1/text()')[0]
resultTitleID = detailsPageElements.xpath('//h1/text()')[0].strip()
else:
resultTitleID = detailsPageElements.xpath('//h4/span')[0].text_content()
resultTitleID = detailsPageElements.xpath('//h4/span')[0].text_content().strip()

sceneID = re.sub(r'\D.*', '', resultTitleID)
metadata.title = PAutils.parseTitle(re.sub(r'^\d+', '', resultTitleID), siteNum)
metadata.title = PAutils.parseTitle(re.sub(r'^\d+', '', resultTitleID).strip().lower(), siteNum)

# Summary
try:
Expand All @@ -119,35 +142,41 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
else:
actors = detailsPageElements.xpath('//h5//a')

actorPhotoURL = ''

# Remove Actor Names from Genre List
genres = detailsPageElements.xpath('//meta[@name="keywords"]/@content')[0].replace('Aussie Ass', '')
genres = re.sub(r'id.\d*', '', genres, flags=re.IGNORECASE)
try:
genres = detailsPageElements.xpath('//meta[@name="keywords"]/@content')[0].replace('Aussie Ass', '')
genres = re.sub(r'id.\d*', '', genres, flags=re.IGNORECASE).lower()
except:
genres = ''

for key, values in actorsDB.items():
for item in values:
if item.lower() in genres:
genres = genres.replace(item.lower(), '')

if actors:
for actorLink in actors:
actorName = actorLink.text_content().title()
genres = genres.replace(actorName, '')
genres = genres.replace(actorName.lower(), '')

modelURL = actorLink.xpath('./@href')[0]
modelURL = actorLink.xpath('./@href')[0].replace('MonteCooper', 'MonteLuxe')
req = PAutils.HTTPRequest(modelURL)
actorsPageElements = HTML.ElementFromString(req.text)

img = actorsPageElements.xpath('//img[contains(@id, "set-target")]/@src')[0]
if img:
actorPhotoURL = img
if 'http' not in actorPhotoURL:
actorPhotoURL = PAsearchSites.getSearchBaseURL(siteNum) + actorPhotoURL
try:
img = actorsPageElements.xpath('//img[contains(@id, "set-target")]/@src')[0]
if img:
if 'http' not in img:
actorPhotoURL = PAsearchSites.getSearchBaseURL(siteNum) + img
except:
actorPhotoURL = ''

movieActors.addActor(actorName, actorPhotoURL)

# Date
date = ""

date = ''
try:
if 'webmasters' in sceneURL:

pageResults = (int)(actorsPageElements.xpath('//span[@class="number_item "]')[0].text_content().strip())

if not pageResults:
Expand All @@ -156,23 +185,27 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
for x in range(pageResults):
if x == 1:
actorsPageElements.xpath('//a[contains(@class, "in_stditem")]/@href')[1]
req = PAutils.HTTPRequest(PAsearchSites.getSearchBaseURL(siteNum) + actorsPageElements.xpath('//a[contains(@class, "in_stditem")]/@href')[1])
actorPageURL = '%s/%s' % (PAsearchSites.getSearchBaseURL(siteNum), actorsPageElements.xpath('//a[contains(@class, "in_stditem")]/@href')[1])
req = PAutils.HTTPRequest(actorPageURL)
actorsPageElements = HTML.ElementFromString(req.text)

for sceneElements in actorsPageElements.xpath('//div[@class="box"]'):
if sceneID in sceneElements.xpath('.//a/text()')[1]:
date = actorsPageElements.xpath('.//span[@class="video-date"]')[0].text_content().strip()
break
else:
date = sceneDate
except:
date = sceneDate
pass

if date:
date = parse(date).strftime('%d-%m-%Y')
date_object = datetime.strptime(date, '%d-%m-%Y')
metadata.originally_available_at = date_object
metadata.year = metadata.originally_available_at.year
elif sceneDate:
date = parse(sceneDate).strftime('%d-%m-%Y')
date_object = datetime.strptime(date, '%d-%m-%Y')
metadata.originally_available_at = date_object
metadata.year = metadata.originally_available_at.year

# Genres
for genreLink in genres.split(','):
Expand All @@ -186,8 +219,7 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
'//div[@class="box"]//img/@src',
]

altURL = ""

altURL = ''
for xpath in xpaths:
for img in detailsPageElements.xpath(xpath):
if 'http' not in img:
Expand All @@ -196,10 +228,10 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
elif 'webmasters' in sceneURL:
img = sceneURL + "/" + img
else:
img = PAsearchSites.getSearchBaseURL(siteNum) + img
img = '%s/%s' % (PAsearchSites.getSearchBaseURL(siteNum), img)
art.append(img)
if 'webmasters' not in sceneURL:
altURL = PAsearchSites.getSearchBaseURL(siteNum) + "/webmasters/" + sceneID
altURL = PAsearchSites.getSearchBaseURL(siteNum) + "/webmasters/" + re.sub(r'^0+', '', sceneID)
req = PAutils.HTTPRequest(altURL)
detailsPageElements = HTML.ElementFromString(req.text)
sceneURL = altURL
Expand All @@ -224,3 +256,11 @@ def update(metadata, lang, siteNum, movieGenres, movieActors, art):
pass

return metadata


# Performer-name spellings to strip from the scene's keyword string.
#
# update() lowercases the page's <meta name="keywords"> content and then, for
# every alias listed here, removes that alias from the string so performer
# names do not end up being added as genres.
#
# Keys are the performer's display name; the removal loop in update() only
# reads the value lists, so the keys act as labels for maintainers.
# Values are the spellings to remove. They are compared against the already
# lowercased keyword string, so keep them lowercase. Some entries contain a
# comma (with or without a following space) -- presumably because the site
# splits first and last name into separate keywords; verify against the live
# page before adding new entries.
# NOTE(review): the bare 'charlie' alias is removed wherever that substring
# occurs in the keyword string, not only as a whole keyword.
actorsDB = {
'Belinda Belfast': ['belinda belfast'],
'Charlotte Star': ['charlotte,star'],
'Charlie Brookes': ['charlie, brookes', 'charlie'],
'Monte Cooper': ['monte, cooper', 'monte cooper'],
}

0 comments on commit 753780d

Please sign in to comment.