Mirror of https://github.com/yt-dlp/yt-dlp, synced 2025-12-16 22:25:40 +07:00

Compare commits


5 Commits

- 854fded114 [ie/TheChosen] Add extractors (#14183) (Mr Flamel, 2025-11-17 00:17:55 +01:00)
  Closes #11246
  Authored by: mrFlamel
- 5f66ac71f6 [ie/mave:channel] Add extractor (#14915) (Anton Larionov, 2025-11-17 00:05:44 +01:00)
  Authored by: anlar
- 4cb5e191ef [ie/youtube] Detect "super resolution" AI-upscaled formats (#15050) (bashonly, 2025-11-16 22:39:22 +00:00)
  Closes #14923
  Authored by: bashonly
- 6ee6a6fc58 [rh:urllib] Do not read after close (#15049) (bashonly, 2025-11-16 19:07:48 +00:00)
  Fix regression introduced in 5767fb4ab1
  Closes #15017
  Authored by: bashonly
- 23f1ab3469 [fd] Fix playback wait time for ffmpeg downloads (#15066) (bashonly, 2025-11-16 18:15:16 +00:00)
  Authored by: bashonly
8 changed files with 341 additions and 43 deletions

test/test_networking.py

@@ -755,6 +755,17 @@ def test_partial_read_then_full_read(self, handler):
                 assert res.read(0) == b''
                 assert res.read() == b'<video src="/vid.mp4" /></html>'
 
+    def test_partial_read_greater_than_response_then_full_read(self, handler):
+        with handler() as rh:
+            for encoding in ('', 'gzip', 'deflate'):
+                res = validate_and_send(rh, Request(
+                    f'http://127.0.0.1:{self.http_port}/content-encoding',
+                    headers={'ytdl-encoding': encoding}))
+                assert res.headers.get('Content-Encoding') == encoding
+                assert res.read(512) == b'<html><video src="/vid.mp4" /></html>'
+                assert res.read(0) == b''
+                assert res.read() == b''
+
     @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True)
     @pytest.mark.handler_flaky('CurlCFFI', reason='segfaults')

@@ -920,6 +931,28 @@ def test_http_response_auto_close(self, handler):
         assert res.fp.fp is None
         assert res.closed
 
+    def test_data_uri_partial_read_then_full_read(self, handler):
+        with handler() as rh:
+            res = validate_and_send(rh, Request('data:text/plain,hello%20world'))
+            assert res.read(6) == b'hello '
+            assert res.read(0) == b''
+            assert res.read() == b'world'
+            # Should automatically close the underlying file object
+            assert res.fp.closed
+            assert res.closed
+
+    def test_data_uri_partial_read_greater_than_response_then_full_read(self, handler):
+        with handler() as rh:
+            res = validate_and_send(rh, Request('data:text/plain,hello%20world'))
+            assert res.read(512) == b'hello world'
+            # Response and its underlying file object should already be closed now
+            assert res.fp.closed
+            assert res.closed
+            assert res.read(0) == b''
+            assert res.read() == b''
+            assert res.fp.closed
+            assert res.closed
+
     def test_http_error_returns_content(self, handler):
         # urllib HTTPError will try close the underlying response if reference to the HTTPError object is lost
         def get_response():
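The read contract these new tests pin down can be reproduced with a plain in-memory stream. A minimal illustrative sketch using io.BytesIO as a stand-in for a response body (not yt-dlp's Response API):

import io

body = io.BytesIO(b'<html><video src="/vid.mp4" /></html>')
assert body.read(512) == b'<html><video src="/vid.mp4" /></html>'  # over-long read drains the stream
assert body.read(0) == b''  # zero-length read is always empty
assert body.read() == b''   # further reads at EOF return b'' rather than raising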

test/test_websockets.py

@@ -40,7 +40,7 @@
 pytestmark = pytest.mark.handler_flaky(
     'Websockets',
-    os.name != 'nt' and sys.implementation.name == 'pypy',
+    os.name == 'nt' or sys.implementation.name == 'pypy',
     reason='segfaults',
 )

yt_dlp/downloader/common.py

@@ -461,7 +461,8 @@ def download(self, filename, info_dict, subtitle=False):
         min_sleep_interval = self.params.get('sleep_interval') or 0
         max_sleep_interval = self.params.get('max_sleep_interval') or 0
-        if available_at := info_dict.get('available_at'):
+        requested_formats = info_dict.get('requested_formats') or [info_dict]
+        if available_at := max(f.get('available_at') or 0 for f in requested_formats):
             forced_sleep_interval = available_at - int(time.time())
             if forced_sleep_interval > min_sleep_interval:
                 sleep_note = 'as required by the site'
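The wait is now computed over every requested format rather than only the top-level info dict: in a merged (e.g. video+audio) download each format can carry its own available_at, and all input URLs must be live before ffmpeg starts. A minimal sketch with invented values (only requested_formats/available_at are real info-dict keys):

import time

# hypothetical merged download; timestamps are made up for illustration
info_dict = {'requested_formats': [
    {'format_id': '303', 'available_at': int(time.time()) + 90},  # video URL ready later
    {'format_id': '251', 'available_at': int(time.time()) + 30},  # audio URL ready sooner
]}
requested_formats = info_dict.get('requested_formats') or [info_dict]
available_at = max(f.get('available_at') or 0 for f in requested_formats)
forced_sleep_interval = available_at - int(time.time())  # ~90s: wait for the slowest format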

yt_dlp/extractor/_extractors.py

@@ -691,6 +691,10 @@
     FrontendMastersIE,
     FrontendMastersLessonIE,
 )
+from .frontro import (
+    TheChosenGroupIE,
+    TheChosenIE,
+)
 from .fujitv import FujiTVFODPlus7IE
 from .funk import FunkIE
 from .funker530 import Funker530IE

@@ -1094,7 +1098,10 @@
 from .massengeschmacktv import MassengeschmackTVIE
 from .masters import MastersIE
 from .matchtv import MatchTVIE
-from .mave import MaveIE
+from .mave import (
+    MaveChannelIE,
+    MaveIE,
+)
 from .mbn import MBNIE
 from .mdr import MDRIE
 from .medaltv import MedalTVIE

yt_dlp/extractor/frontro.py (new file, 164 lines)

@@ -0,0 +1,164 @@
import json

from .common import InfoExtractor
from ..utils import int_or_none, parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj


class FrontoBaseIE(InfoExtractor):
    def _get_auth_headers(self, url):
        return traverse_obj(self._get_cookies(url), {
            'authorization': ('frAccessToken', 'value', {lambda token: f'Bearer {token}' if token else None}),
        })


class FrontroVideoBaseIE(FrontoBaseIE):
    _CHANNEL_ID = None

    def _real_extract(self, url):
        video_id = self._match_id(url)
        metadata = self._download_json(
            'https://api.frontrow.cc/query', video_id, data=json.dumps({
                'operationName': 'Video',
                'variables': {'channelID': self._CHANNEL_ID, 'videoID': video_id},
                'query': '''query Video($channelID: ID!, $videoID: ID!) {
                    video(ChannelID: $channelID, VideoID: $videoID) {
                        ... on Video {title description updatedAt thumbnail createdAt duration likeCount comments views url hasAccess}
                    }
                }''',
            }).encode(), headers={
                'content-type': 'application/json',
                **self._get_auth_headers(url),
            })['data']['video']

        if not traverse_obj(metadata, 'hasAccess'):
            self.raise_login_required()

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(metadata['url'], video_id)

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            **traverse_obj(metadata, {
                'title': ('title', {str}),
                'description': ('description', {str}),
                'thumbnail': ('thumbnail', {url_or_none}),
                'timestamp': ('createdAt', {parse_iso8601}),
                'modified_timestamp': ('updatedAt', {parse_iso8601}),
                'duration': ('duration', {int_or_none}),
                'like_count': ('likeCount', {int_or_none}),
                'comment_count': ('comments', {int_or_none}),
                'view_count': ('views', {int_or_none}),
            }),
        }


class FrontroGroupBaseIE(FrontoBaseIE):
    _CHANNEL_ID = None
    _VIDEO_EXTRACTOR = None
    _VIDEO_URL_TMPL = None

    def _real_extract(self, url):
        group_id = self._match_id(url)
        metadata = self._download_json(
            'https://api.frontrow.cc/query', group_id, note='Downloading playlist metadata',
            data=json.dumps({
                'operationName': 'PaginatedStaticPageContainer',
                'variables': {'channelID': self._CHANNEL_ID, 'first': 500, 'pageContainerID': group_id},
                'query': '''query PaginatedStaticPageContainer($channelID: ID!, $pageContainerID: ID!) {
                    pageContainer(ChannelID: $channelID, PageContainerID: $pageContainerID) {
                        ... on StaticPageContainer { id title updatedAt createdAt itemRefs {edges {node {
                            id contentItem { ... on ItemVideo { videoItem: item {
                                id
                            }}}
                        }}}
                    }
                }''',
            }).encode(), headers={
                'content-type': 'application/json',
                **self._get_auth_headers(url),
            })['data']['pageContainer']

        entries = []
        for video_id in traverse_obj(metadata, (
            'itemRefs', 'edges', ..., 'node', 'contentItem', 'videoItem', 'id', {str}),
        ):
            entries.append(self.url_result(
                self._VIDEO_URL_TMPL % video_id, self._VIDEO_EXTRACTOR, video_id))

        return {
            '_type': 'playlist',
            'id': group_id,
            'entries': entries,
            **traverse_obj(metadata, {
                'title': ('title', {str}),
                'timestamp': ('createdAt', {parse_iso8601}),
                'modified_timestamp': ('updatedAt', {parse_iso8601}),
            }),
        }


class TheChosenIE(FrontroVideoBaseIE):
    _CHANNEL_ID = '12884901895'
    _VALID_URL = r'https?://(?:www\.)?watch\.thechosen\.tv/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://watch.thechosen.tv/video/184683594325',
        'md5': '3f878b689588c71b38ec9943c54ff5b0',
        'info_dict': {
            'id': '184683594325',
            'ext': 'mp4',
            'title': 'Season 3 Episode 2: Two by Two',
            'description': 'md5:174c373756ecc8df46b403f4fcfbaf8c',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'duration': 4212,
            'thumbnail': r're:https://fastly\.frontrowcdn\.com/channels/12884901895/VIDEO_THUMBNAIL/184683594325/',
            'timestamp': 1698954546,
            'upload_date': '20231102',
            'modified_timestamp': int,
            'modified_date': str,
        },
    }, {
        'url': 'https://watch.thechosen.tv/video/184683596189',
        'md5': 'd581562f9d29ce82f5b7770415334151',
        'info_dict': {
            'id': '184683596189',
            'ext': 'mp4',
            'title': 'Season 4 Episode 8: Humble',
            'description': 'md5:20a57bead43da1cf77cd5b0fe29bbc76',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'duration': 5092,
            'thumbnail': r're:https://fastly\.frontrowcdn\.com/channels/12884901895/VIDEO_THUMBNAIL/184683596189/',
            'timestamp': 1715019474,
            'upload_date': '20240506',
            'modified_timestamp': int,
            'modified_date': str,
        },
    }]


class TheChosenGroupIE(FrontroGroupBaseIE):
    _CHANNEL_ID = '12884901895'
    _VIDEO_EXTRACTOR = TheChosenIE
    _VIDEO_URL_TMPL = 'https://watch.thechosen.tv/video/%s'
    _VALID_URL = r'https?://(?:www\.)?watch\.thechosen\.tv/group/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://watch.thechosen.tv/group/309237658592',
        'info_dict': {
            'id': '309237658592',
            'title': 'Season 3',
            'timestamp': 1746203969,
            'upload_date': '20250502',
            'modified_timestamp': int,
            'modified_date': str,
        },
        'playlist_count': 8,
    }]
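The extractors authenticate by lifting a browser cookie into an Authorization header. A standalone sketch of that mapping using only the standard library (the cookie name frAccessToken comes from the code above; the token value and everything else are illustrative, not the yt-dlp API):

from http.cookies import SimpleCookie

cookies = SimpleCookie('frAccessToken=abc123')  # stand-in for what _get_cookies(url) surfaces
token = cookies['frAccessToken'].value if 'frAccessToken' in cookies else None
headers = {'authorization': f'Bearer {token}'} if token else {}
assert headers == {'authorization': 'Bearer abc123'}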

yt_dlp/extractor/mave.py

@@ -1,7 +1,9 @@
-import re
+import functools
+import math
 
 from .common import InfoExtractor
 from ..utils import (
+    InAdvancePagedList,
     clean_html,
     int_or_none,
     parse_iso8601,

@@ -10,15 +12,64 @@
 from ..utils.traversal import require, traverse_obj
 
 
-class MaveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<channel>[\w-]+)\.mave\.digital/(?P<id>ep-\d+)'
+class MaveBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://api.mave.digital/v1/website'
+    _API_BASE_STORAGE_URL = 'https://store.cloud.mts.ru/mave/'
+
+    def _load_channel_meta(self, channel_id, display_id):
+        return traverse_obj(self._download_json(
+            f'{self._API_BASE_URL}/{channel_id}/', display_id,
+            note='Downloading channel metadata'), 'podcast')
+
+    def _load_episode_meta(self, channel_id, episode_code, display_id):
+        return self._download_json(
+            f'{self._API_BASE_URL}/{channel_id}/episodes/{episode_code}',
+            display_id, note='Downloading episode metadata')
+
+    def _create_entry(self, channel_id, channel_meta, episode_meta):
+        episode_code = traverse_obj(episode_meta, ('code', {int}, {require('episode code')}))
+        return {
+            'display_id': f'{channel_id}-{episode_code}',
+            'extractor_key': MaveIE.ie_key(),
+            'extractor': MaveIE.IE_NAME,
+            'webpage_url': f'https://{channel_id}.mave.digital/ep-{episode_code}',
+            'channel_id': channel_id,
+            'channel_url': f'https://{channel_id}.mave.digital/',
+            'vcodec': 'none',
+            **traverse_obj(episode_meta, {
+                'id': ('id', {str}),
+                'url': ('audio', {urljoin(self._API_BASE_STORAGE_URL)}),
+                'title': ('title', {str}),
+                'description': ('description', {clean_html}),
+                'thumbnail': ('image', {urljoin(self._API_BASE_STORAGE_URL)}),
+                'duration': ('duration', {int_or_none}),
+                'season_number': ('season', {int_or_none}),
+                'episode_number': ('number', {int_or_none}),
+                'view_count': ('listenings', {int_or_none}),
+                'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
+                'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
+                'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
+                'timestamp': ('publish_date', {parse_iso8601}),
+            }),
+            **traverse_obj(channel_meta, {
+                'series_id': ('id', {str}),
+                'series': ('title', {str}),
+                'channel': ('title', {str}),
+                'uploader': ('author', {str}),
+            }),
+        }
+
+
+class MaveIE(MaveBaseIE):
+    IE_NAME = 'mave'
+    _VALID_URL = r'https?://(?P<channel_id>[\w-]+)\.mave\.digital/ep-(?P<episode_code>\d+)'
     _TESTS = [{
         'url': 'https://ochenlichnoe.mave.digital/ep-25',
         'md5': 'aa3e513ef588b4366df1520657cbc10c',
         'info_dict': {
             'id': '4035f587-914b-44b6-aa5a-d76685ad9bc2',
             'ext': 'mp3',
-            'display_id': 'ochenlichnoe-ep-25',
+            'display_id': 'ochenlichnoe-25',
             'title': 'Между мной и миром: психология самооценки',
             'description': 'md5:4b7463baaccb6982f326bce5c700382a',
             'uploader': 'Самарский университет',

@@ -45,7 +96,7 @@ class MaveIE(InfoExtractor):
         'info_dict': {
             'id': '41898bb5-ff57-4797-9236-37a8e537aa21',
             'ext': 'mp3',
-            'display_id': 'budem-ep-12',
+            'display_id': 'budem-12',
             'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана',
             'description': 'md5:fa3bdd59ee829dfaf16e3efcb13f1d19',
             'uploader': 'Полина Цветкова+Евгения Акопова',

@@ -68,40 +119,72 @@ class MaveIE(InfoExtractor):
             'upload_date': '20241230',
         },
     }]
-    _API_BASE_URL = 'https://api.mave.digital/'
 
     def _real_extract(self, url):
-        channel_id, slug = self._match_valid_url(url).group('channel', 'id')
-        display_id = f'{channel_id}-{slug}'
-        webpage = self._download_webpage(url, display_id)
-        data = traverse_obj(
-            self._search_nuxt_json(webpage, display_id),
-            ('data', lambda _, v: v['activeEpisodeData'], any, {require('podcast data')}))
+        channel_id, episode_code = self._match_valid_url(url).group(
+            'channel_id', 'episode_code')
+        display_id = f'{channel_id}-{episode_code}'
+
+        channel_meta = self._load_channel_meta(channel_id, display_id)
+        episode_meta = self._load_episode_meta(channel_id, episode_code, display_id)
+
+        return self._create_entry(channel_id, channel_meta, episode_meta)
+
+
+class MaveChannelIE(MaveBaseIE):
+    IE_NAME = 'mave:channel'
+    _VALID_URL = r'https?://(?P<id>[\w-]+)\.mave\.digital/?(?:$|[?#])'
+    _TESTS = [{
+        'url': 'https://budem.mave.digital/',
+        'info_dict': {
+            'id': 'budem',
+            'title': 'Все там будем',
+            'description': 'md5:f04ae12a42be0f1d765c5e326b41987a',
+        },
+        'playlist_mincount': 15,
+    }, {
+        'url': 'https://ochenlichnoe.mave.digital/',
+        'info_dict': {
+            'id': 'ochenlichnoe',
+            'title': 'Очень личное',
+            'description': 'md5:ee36a6a52546b91b487fe08c552fdbb2',
+        },
+        'playlist_mincount': 20,
+    }, {
+        'url': 'https://geekcity.mave.digital/',
+        'info_dict': {
+            'id': 'geekcity',
+            'title': 'Мужчины в трико',
+            'description': 'md5:4164d425d60a0d97abdce9d1f6f8e049',
+        },
+        'playlist_mincount': 80,
+    }]
+    _PAGE_SIZE = 50
+
+    def _entries(self, channel_id, channel_meta, page_num):
+        page_data = self._download_json(
+            f'{self._API_BASE_URL}/{channel_id}/episodes', channel_id, query={
+                'view': 'all',
+                'page': page_num + 1,
+                'sort': 'newest',
+                'format': 'all',
+            }, note=f'Downloading page {page_num + 1}')
+        for ep in traverse_obj(page_data, ('episodes', lambda _, v: v['audio'] and v['id'])):
+            yield self._create_entry(channel_id, channel_meta, ep)
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        channel_meta = self._load_channel_meta(channel_id, channel_id)
 
         return {
-            'display_id': display_id,
-            'channel_id': channel_id,
-            'channel_url': f'https://{channel_id}.mave.digital/',
-            'vcodec': 'none',
-            'thumbnail': re.sub(r'_\d+(?=\.(?:jpg|png))', '', self._og_search_thumbnail(webpage, default='')) or None,
-            **traverse_obj(data, ('activeEpisodeData', {
-                'url': ('audio', {urljoin(self._API_BASE_URL)}),
-                'id': ('id', {str}),
-                'title': ('title', {str}),
-                'description': ('description', {clean_html}),
-                'duration': ('duration', {int_or_none}),
-                'season_number': ('season', {int_or_none}),
-                'episode_number': ('number', {int_or_none}),
-                'view_count': ('listenings', {int_or_none}),
-                'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
-                'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
-                'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
-                'timestamp': ('publish_date', {parse_iso8601}),
-            })),
-            **traverse_obj(data, ('podcast', 'podcast', {
-                'series_id': ('id', {str}),
-                'series': ('title', {str}),
-                'channel': ('title', {str}),
-                'uploader': ('author', {str}),
-            })),
+            '_type': 'playlist',
+            'id': channel_id,
+            **traverse_obj(channel_meta, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+            }),
+            'entries': InAdvancePagedList(
+                functools.partial(self._entries, channel_id, channel_meta),
+                math.ceil(channel_meta['episodes_count'] / self._PAGE_SIZE), self._PAGE_SIZE),
         }
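MaveChannelIE pages through episodes with InAdvancePagedList, which needs the total page count up front; episodes_count from the channel metadata supplies it. The page math in isolation, with an invented episode count (InAdvancePagedList itself is yt-dlp's utility and is not re-implemented here):

import math

PAGE_SIZE = 50
episodes_count = 123  # hypothetical value from channel metadata
num_pages = math.ceil(episodes_count / PAGE_SIZE)  # -> 3 pages
# the API is queried with 1-based page numbers, hence page_num + 1 above
assert [page_num + 1 for page_num in range(num_pages)] == [1, 2, 3]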

yt_dlp/extractor/youtube/_video.py

@@ -3150,6 +3150,9 @@ def _extract_formats_and_subtitles(self, video_id, player_responses, player_url,
            self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. '
                                                'Use formats=duplicate extractor argument instead')
 
+        def is_super_resolution(f_url):
+            return '1' in traverse_obj(f_url, ({parse_qs}, 'xtags', ..., {urllib.parse.parse_qs}, 'sr', ...))
+
         def solve_sig(s, spec):
             return ''.join(s[i] for i in spec)

@@ -3202,7 +3205,7 @@ def get_language_code_and_preference(fmt_stream):
         def get_stream_id(fmt_stream):
             return str_or_none(fmt_stream.get('itag')), traverse_obj(fmt_stream, 'audioTrack', 'id'), fmt_stream.get('isDrc')
 
-        def process_format_stream(fmt_stream, proto, missing_pot):
+        def process_format_stream(fmt_stream, proto, missing_pot, super_resolution=False):
             itag = str_or_none(fmt_stream.get('itag'))
             audio_track = fmt_stream.get('audioTrack') or {}
             quality = fmt_stream.get('quality')

@@ -3253,10 +3256,13 @@ def process_format_stream(fmt_stream, proto, missing_pot):
             dct = {
                 'asr': int_or_none(fmt_stream.get('audioSampleRate')),
                 'filesize': int_or_none(fmt_stream.get('contentLength')),
-                'format_id': f'{itag}{"-drc" if fmt_stream.get("isDrc") else ""}',
+                'format_id': join_nonempty(itag, (
+                    'drc' if fmt_stream.get('isDrc')
+                    else 'sr' if super_resolution
+                    else None)),
                 'format_note': join_nonempty(
                     join_nonempty(audio_track.get('displayName'), audio_track.get('audioIsDefault') and '(default)', delim=' '),
-                    name, fmt_stream.get('isDrc') and 'DRC',
+                    name, fmt_stream.get('isDrc') and 'DRC', super_resolution and 'AI-upscaled',
                     try_get(fmt_stream, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                     try_get(fmt_stream, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
                     is_damaged and 'DAMAGED', missing_pot and 'MISSING POT',

@@ -3342,7 +3348,9 @@ def process_https_formats():
                         self.report_warning(msg, video_id, only_once=True)
                         continue
 
-                    fmt = process_format_stream(fmt_stream, proto, missing_pot=require_po_token and not po_token)
+                    fmt = process_format_stream(
+                        fmt_stream, proto, missing_pot=require_po_token and not po_token,
+                        super_resolution=is_super_resolution(fmt_url))
                     if not fmt:
                         continue
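is_super_resolution looks for sr=1 inside the format URL's xtags query parameter, whose value is itself encoded as a query string. A stdlib-only sketch of the same check (the example URL is fabricated; yt-dlp's own parse_qs accepts a full URL, so the real helper is more compact):

import urllib.parse

fmt_url = 'https://example.invalid/videoplayback?itag=616&xtags=sr%3D1'  # made-up URL
outer = urllib.parse.parse_qs(urllib.parse.urlsplit(fmt_url).query)
is_sr = any('1' in urllib.parse.parse_qs(xtags).get('sr', [])
            for xtags in outer.get('xtags', []))
assert is_sr  # this format would get the 'sr' format_id suffix and 'AI-upscaled' note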

yt_dlp/networking/_urllib.py

@@ -305,6 +305,8 @@ def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
             status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))
 
     def read(self, amt=None):
+        if self.closed:
+            return b''
         try:
             data = self.fp.read(amt)
             underlying = getattr(self.fp, 'fp', None)
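Why the guard is needed: once a response has been closed, reading its underlying file object raises instead of returning empty bytes, which is the path the new partial-read tests exercise. A minimal illustration with a plain file object (not the yt-dlp adapter itself):

import io

fp = io.BytesIO(b'data')
fp.close()
try:
    fp.read()
except ValueError:  # 'I/O operation on closed file'
    pass  # the adapter now short-circuits to b'' before hitting this error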