Skip to content

Commit

Permalink
Rework on subscribestar extractor
Browse files Browse the repository at this point in the history
 - New support for embedded audios
 - New support for external links compatible with yt-dlp
 - Add a content_type field at the post level for directory creation
 - Major rework of the logic
 - Added a check_if_supported_by_ytdlp helper function in util.py
   for yt-dlp external links handling
  • Loading branch information
WyohKnott committed May 11, 2024
1 parent d2f50ec commit 259842f
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 46 deletions.
132 changes: 87 additions & 45 deletions gallery_dl/extractor/subscribestar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2020-2023 Mike Fährmann
# Copyright 2020-2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand All @@ -17,6 +17,7 @@

class SubscribestarExtractor(Extractor):
"""Base class for subscribestar extractors"""

category = "subscribestar"
root = "https://www.subscribestar.com"
directory_fmt = ("{category}", "{author_name}")
Expand Down Expand Up @@ -74,6 +75,7 @@ def _login_impl(self, username, password):

response = self.request(
url, method="POST", headers=headers, data=data, fatal=False)

if response.json().get("errors"):
self.log.debug(response.json()["errors"])
raise exception.AuthenticationError()
Expand All @@ -84,44 +86,98 @@ def _login_impl(self, username, password):
if cookie.name.startswith("auth")
}

def _media_from_post(self, html):
def _extract_media(self, html, media_types):
media = []
media_config = {
"gallery": ('data-gallery="', '"', self._process_gallery_item),
"attachments": (
'class="uploads-docs"',
'data-role="post-edit_form"',
self._process_attachment_item,
),
"link": ('data-href="', '"', self._process_media_item),
"audio": ('<source src="', '" type="audio/',
self._process_media_item),
}

gallery = text.extr(html, 'data-gallery="', '"')
if gallery:
for item in util.json_loads(text.unescape(gallery)):
if "/previews" in item["url"]:
self._warn_preview()
else:
media.append(item)

attachments = text.extr(
html, 'class="uploads-docs"', 'data-role="post-edit_form"')
if attachments:
for att in attachments.split('class="doc_preview"')[1:]:
media.append({
"id" : text.parse_int(text.extr(
att, 'data-upload-id="', '"')),
"name": text.unescape(text.extr(
att, 'doc_preview-title">', '<')),
"url" : text.unescape(text.extr(att, 'href="', '"')),
"type": "attachment",
})
for key, config in media_types.items():
if key in media_config:
start, end, processor = media_config[key]
segments = (
text.extract_all(
html,
((key, start, end),),
)[0],
)
for segment in segments:
if segment[key]:
content = processor(segment, key)
if content:
media.append(content)
return media

def _process_gallery_item(self, item, media_type):
gallery_list = util.json_loads(text.unescape(item["gallery"]))
for media in gallery_list:
if "/previews" in media["url"]:
self._warn_preview()
return {"url": media["url"], "type": media_type}

def _process_attachment_item(self, item, media_type):
return {
"id": text.parse_int(text.extr(item, 'data-upload-id="', '"')),
"name": text.unescape(text.extr(item, 'doc_preview-title">', "<")),
"url": text.unescape(text.extr(item, 'href="', '"')),
"type": media_type,
}

def _process_media_item(self, item, media_type):
if media_type == "link" and util.check_if_supported_by_ytdlp(
item[media_type]):
return {"url": "ytdl:" + item[media_type], "type": media_type}
elif media_type == "audio":
return {"url": item[media_type], "type": media_type}

def _media_from_post(self, html):
media_types = {
"gallery": True,
"attachments": True,
"link": True,
"audio": True,
}
media = self._extract_media(html, media_types)
return media

def _data_from_post(self, html):
extr = text.extract_from(html)

links = (text.extract_all(html, (("url", 'data-href="', '"'),), )[0],)
audios = (text.extract_all(html, (("url", '<source src="',
'" type="audio/'),),)[0],)
gallery = text.extr(html, 'data-gallery="', '"')

content_type = None
if links and any(item["url"] for item in links):
content_type = "link"
if audios and any(item["url"] for item in audios):
content_type = "audio"
if gallery:
for item in util.json_loads(text.unescape(gallery)):
if item["type"] == "video":
content_type = "video"
break
else:
content_type = "image"

return {
"post_id" : text.parse_int(extr('data-id="', '"')),
"author_id" : text.parse_int(extr('data-user-id="', '"')),
"author_name": text.unescape(extr('href="/', '"')),
"author_nick": text.unescape(extr('>', '<')),
"date" : self._parse_datetime(extr(
'class="post-date">', '</').rpartition(">")[2]),
"content" : (extr(
'<div class="post-content', '<div class="post-uploads')
.partition(">")[2]),
"post_id" : text.parse_int(extr('data-id="', '"')),
"author_id" : text.parse_int(extr('data-user-id="', '"')),
"author_name" : text.unescape(extr('href="/', '"')),
"author_nick" : text.unescape(extr(">", "<")),
"date" : self._parse_datetime(
extr('class="post-date">', "</").rpartition(">")[2]),
"content" : extr("<body>\n", "\n</body>"),
"content_type" : content_type,
}

def _parse_datetime(self, dt):
Expand Down Expand Up @@ -166,17 +222,3 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
def posts(self):
url = "{}/posts/{}".format(self.root, self.item)
return (self.request(url).text,)

def _data_from_post(self, html):
extr = text.extract_from(html)
return {
"post_id" : text.parse_int(extr('data-id="', '"')),
"author_name": text.unescape(extr('href="/', '"')),
"author_id" : text.parse_int(extr('data-user-id="', '"')),
"author_nick": text.unescape(extr('alt="', '"')),
"date" : self._parse_datetime(extr(
'<span class="star_link-types">', '<')),
"content" : (extr(
'<div class="post-content', '<div class="post-uploads')
.partition(">")[2]),
}
11 changes: 10 additions & 1 deletion gallery_dl/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import urllib.parse
from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, version, exception
from . import text, version, exception, ytdl, config


def bencode(num, alphabet="0123456789"):
Expand Down Expand Up @@ -496,6 +496,15 @@ def language_to_code(lang, default=None):
}


def check_if_supported_by_ytdlp(url):
ytdl_module = ytdl.import_module(
config.get(("extractor", "ytdl"), "module"))
for ie in ytdl_module.extractor.gen_extractor_classes():
if ie.suitable(url):
return True
return False


class HTTPBasicAuth():
__slots__ = ("authorization",)

Expand Down

0 comments on commit 259842f

Please sign in to comment.