1
0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-12-18 07:05:41 +07:00

Compare commits

..

5 Commits

Author SHA1 Message Date
bashonly
ff61bef041 [ie/youtube:tab] Fix flat thumbnails extraction for shorts (#15331)
Closes #15329
Authored by: bashonly
2025-12-15 22:37:25 +00:00
sepro
04f2ec4b97 [ie/parti] Fix extractors (#15319)
Authored by: seproDev
2025-12-13 20:00:56 +01:00
0x∅
b6f24745bf [ie/telecinco] Fix extractor (#15311)
Closes #15240
Authored by: 0xvd, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2025-12-12 22:25:45 +00:00
norepro
f2ee2a46fc [ie/pornhub] Optimize metadata extraction (#15231)
Closes #14621
Authored by: norepro
2025-12-12 20:52:09 +00:00
bashonly
5f37f67d37 [ie/archive.org] Fix metadata extraction (#15286)
Closes #15280
Authored by: bashonly
2025-12-09 19:05:12 +00:00
6 changed files with 63 additions and 59 deletions

View File

@@ -261,7 +261,7 @@ def sanitize(key, value):
def expect_info_dict(self, got_dict, expected_dict):
ALLOWED_KEYS_SORT_ORDER = (
# NB: Keep in sync with the docstring of extractor/common.py
'id', 'ext', 'direct', 'display_id', 'title', 'alt_title', 'description', 'media_type',
'ie_key', 'url', 'id', 'ext', 'direct', 'display_id', 'title', 'alt_title', 'description', 'media_type',
'uploader', 'uploader_id', 'uploader_url', 'channel', 'channel_id', 'channel_url', 'channel_is_verified',
'channel_follower_count', 'comment_count', 'view_count', 'concurrent_view_count',
'like_count', 'dislike_count', 'repost_count', 'average_rating', 'age_limit', 'duration', 'thumbnail', 'heatmap',

View File

@@ -279,7 +279,7 @@ def _real_extract(self, url):
'url': 'https://archive.org/' + track['file'].lstrip('/'),
}
metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
metadata = self._download_json(f'https://archive.org/metadata/{identifier}', identifier)
m = metadata['metadata']
identifier = m['identifier']

View File

@@ -6,7 +6,10 @@
class PartiBaseIE(InfoExtractor):
def _call_api(self, path, video_id, note=None):
return self._download_json(
f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, note)
f'https://prod-api.parti.com/parti_v2/profile/{path}', video_id, note, headers={
'Origin': 'https://parti.com',
'Referer': 'https://parti.com/',
})
class PartiVideoIE(PartiBaseIE):
@@ -20,7 +23,7 @@ class PartiVideoIE(PartiBaseIE):
'title': 'NOW LIVE ',
'upload_date': '20250327',
'categories': ['Gaming'],
'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png',
'thumbnail': 'https://media.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png',
'channel': 'ItZTMGG',
'timestamp': 1743044379,
},
@@ -34,7 +37,7 @@ def _real_extract(self, url):
return {
'id': video_id,
'formats': self._extract_m3u8_formats(
urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'),
urljoin('https://media.parti.com/', data['livestream_recording']), video_id, 'mp4'),
**traverse_obj(data, {
'title': ('event_title', {str}),
'channel': ('user_name', {str}),
@@ -47,32 +50,27 @@ def _real_extract(self, url):
class PartiLivestreamIE(PartiBaseIE):
IE_NAME = 'parti:livestream'
_VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P<service>[\w]+)/(?P<id>[\w/-]+)'
_VALID_URL = r'https?://(?:www\.)?parti\.com/(?!video/)(?P<id>[\w/-]+)'
_TESTS = [{
'url': 'https://parti.com/creator/parti/Capt_Robs_Adventures',
'url': 'https://parti.com/247CryptoTracker',
'info_dict': {
'id': 'Capt_Robs_Adventures',
'ext': 'mp4',
'id': '247CryptoTracker',
'description': 'md5:a78051f3d7e66e6a64c6b1eaf59fd364',
'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}",
'view_count': int,
'thumbnail': r're:https://assets\.parti\.com/.+\.png',
'timestamp': 1743879776,
'upload_date': '20250405',
'thumbnail': r're:https://media\.parti\.com/stream-screenshots/.+\.png',
'live_status': 'is_live',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://parti.com/creator/discord/sazboxgaming/0',
'only_matching': True,
}]
def _real_extract(self, url):
service, creator_slug = self._match_valid_url(url).group('service', 'id')
creator_slug = self._match_id(url)
encoded_creator_slug = creator_slug.replace('/', '%23')
creator_id = self._call_api(
f'get_user_by_social_media/{service}/{encoded_creator_slug}',
creator_slug, note='Fetching user ID')
f'user_id_from_name/{encoded_creator_slug}',
creator_slug, note='Fetching user ID')['user_id']
data = self._call_api(
f'get_livestream_channel_info/{creator_id}', creator_id,
@@ -85,11 +83,7 @@ def _real_extract(self, url):
return {
'id': creator_slug,
'formats': self._extract_m3u8_formats(
channel_info['playback_url'], creator_slug, live=True, query={
'token': channel_info['playback_auth_token'],
'player_version': '1.17.0',
}),
'formats': self._extract_m3u8_formats(channel_info['playback_url'], creator_slug, live=True),
'is_live': True,
**traverse_obj(data, {
'title': ('livestream_event_info', 'event_name', {str}),

View File

@@ -24,6 +24,7 @@
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import find_elements, traverse_obj
class PornHubBaseIE(InfoExtractor):
@@ -137,23 +138,24 @@ class PornHubIE(PornHubBaseIE):
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': 'a6391306d050e4547f62b3f485dd9ba9',
'md5': '4d4a4e9178b655776f86cf89ecaf0edf',
'info_dict': {
'id': '648719015',
'ext': 'mp4',
'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
'uploader': 'Babes',
'uploader': 'BABES-COM',
'uploader_id': '/users/babes-com',
'upload_date': '20130628',
'timestamp': 1372447216,
'duration': 361,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'age_limit': 18,
'tags': list,
'categories': list,
'cast': list,
'thumbnail': r're:https?://.+',
},
}, {
# non-ASCII title
@@ -480,13 +482,6 @@ def extract_vote_count(kind, name):
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
def extract_list(meta_key):
div = self._search_regex(
rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
webpage, meta_key, default=None)
if div:
return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
info = self._search_json_ld(webpage, video_id, default={})
# description provided in JSON-LD is irrelevant
info['description'] = None
@@ -505,9 +500,11 @@ def extract_list(meta_key):
'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
'tags': extract_list('tags'),
'categories': extract_list('categories'),
'cast': extract_list('pornstars'),
**traverse_obj(webpage, {
'tags': ({find_elements(attr='data-label', value='tag')}, ..., {clean_html}),
'categories': ({find_elements(attr='data-label', value='category')}, ..., {clean_html}),
'cast': ({find_elements(attr='data-label', value='pornstar')}, ..., {clean_html}),
}),
'subtitles': subtitles,
}, info)

View File

@@ -6,20 +6,21 @@
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
int_or_none,
join_nonempty,
str_or_none,
traverse_obj,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class TelecincoBaseIE(InfoExtractor):
def _parse_content(self, content, url):
video_id = content['dataMediaId']
video_id = content['dataMediaId'][1]
config = self._download_json(
content['dataConfig'], video_id, 'Downloading config JSON')
content['dataConfig'][1], video_id, 'Downloading config JSON')
services = config['services']
caronte = self._download_json(services['caronte'], video_id)
if traverse_obj(caronte, ('dls', 0, 'drm', {bool})):
@@ -57,9 +58,9 @@ def _parse_content(self, content, url):
'id': video_id,
'title': traverse_obj(config, ('info', 'title', {str})),
'formats': formats,
'thumbnail': (traverse_obj(content, ('dataPoster', {url_or_none}))
'thumbnail': (traverse_obj(content, ('dataPoster', 1, {url_or_none}))
or traverse_obj(config, 'poster', 'imageUrl', expected_type=url_or_none)),
'duration': traverse_obj(content, ('dataDuration', {int_or_none})),
'duration': traverse_obj(content, ('dataDuration', 1, {int_or_none})),
'http_headers': headers,
}
@@ -137,30 +138,45 @@ class TelecincoIE(TelecincoBaseIE):
'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
'only_matching': True,
}]
_ASTRO_ISLAND_RE = re.compile(r'<astro-island\b[^>]+>')
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id, impersonate=True)
article = self._search_json(
r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=',
webpage, 'article', display_id)['article']
description = traverse_obj(article, ('leadParagraph', {clean_html}, filter))
if article.get('editorialType') != 'VID':
props_list = traverse_obj(webpage, (
{self._ASTRO_ISLAND_RE.findall}, ...,
{extract_attributes}, 'props', {json.loads}))
description = traverse_obj(props_list, (..., 'leadParagraph', 1, {clean_html}, any, filter))
main_content = traverse_obj(props_list, (..., ('content', ('articleData', 1, 'opening')), 1, {dict}, any))
if traverse_obj(props_list, (..., 'editorialType', 1, {str}, any)) != 'VID': # e.g. 'ART'
entries = []
for p in traverse_obj(article, ((('opening', all), 'body'), lambda _, v: v['content'])):
content = p['content']
type_ = p.get('type')
if type_ == 'paragraph' and isinstance(content, str):
for p in traverse_obj(props_list, (..., 'articleData', 1, ('opening', ('body', 1, ...)), 1, {dict})):
type_ = traverse_obj(p, ('type', 1, {str}))
content = traverse_obj(p, ('content', 1, {str} if type_ == 'paragraph' else {dict}))
if not content:
continue
if type_ == 'paragraph':
description = join_nonempty(description, content, delim='')
elif type_ == 'video' and isinstance(content, dict):
elif type_ == 'video':
entries.append(self._parse_content(content, url))
else:
self.report_warning(
f'Skipping unsupported content type "{type_}"', display_id, only_once=True)
return self.playlist_result(
entries, str_or_none(article.get('id')),
traverse_obj(article, ('title', {str})), clean_html(description))
entries,
traverse_obj(props_list, (..., 'id', 1, {int}, {str_or_none}, any)) or display_id,
traverse_obj(main_content, ('dataTitle', 1, {str})),
clean_html(description))
info = self._parse_content(article['opening']['content'], url)
if not main_content:
raise ExtractorError('Unable to extract main content from webpage')
info = self._parse_content(main_content, url)
info['description'] = description
return info

View File

@@ -382,7 +382,8 @@ def _rich_entries(self, rich_grid_renderer):
('accessibilityText', {lambda x: re.fullmatch(r'(.+), (?:[\d,.]+(?:[KM]| million)?|No) views? - play Short', x)}, 1)), any),
'view_count': ('overlayMetadata', 'secondaryText', 'content', {parse_count}),
}),
thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources'))
thumbnails=self._extract_thumbnails(
renderer, ('thumbnailViewModel', 'thumbnailViewModel', 'image'), final_key='sources'))
return
def _video_entry(self, video_renderer):
@@ -1585,7 +1586,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'playlist_count': 50,
'expected_warnings': ['YouTube Music is not directly supported'],
}, {
# TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
'note': 'unlisted single video playlist',
'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_',
'info_dict': {
@@ -1885,8 +1885,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'playlist_mincount': 30,
}, {
# Shorts url result in shorts tab
# TODO: Fix channel id extraction
# TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts',
'info_dict': {
'id': 'UCiu-3thuViMebBjw_5nWYrA',
@@ -1915,7 +1913,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'params': {'extract_flat': True},
}, {
# Live video status should be extracted
# TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live',
'info_dict': {
'id': 'UCQvWX73GQygcwXOTSf_VDVg',