[YouTube] Support JSON3 subtitle format

* subtitle tests updated to match
pull/30838/head
dirkf 2022-06-09 15:25:23 +01:00
parent 3aa94d7945
commit 811c480f7b
2 changed files with 55 additions and 21 deletions

View File

@ -59,6 +59,21 @@ class BaseTestSubtitles(unittest.TestCase):
class TestYoutubeSubtitles(BaseTestSubtitles): class TestYoutubeSubtitles(BaseTestSubtitles):
# Available subtitles for QRS8MkLhQmM:
# Language formats
# ru vtt, ttml, srv3, srv2, srv1, json3
# fr vtt, ttml, srv3, srv2, srv1, json3
# en vtt, ttml, srv3, srv2, srv1, json3
# nl vtt, ttml, srv3, srv2, srv1, json3
# de vtt, ttml, srv3, srv2, srv1, json3
# ko vtt, ttml, srv3, srv2, srv1, json3
# it vtt, ttml, srv3, srv2, srv1, json3
# zh-Hant vtt, ttml, srv3, srv2, srv1, json3
# hi vtt, ttml, srv3, srv2, srv1, json3
# pt-BR vtt, ttml, srv3, srv2, srv1, json3
# es-MX vtt, ttml, srv3, srv2, srv1, json3
# ja vtt, ttml, srv3, srv2, srv1, json3
# pl vtt, ttml, srv3, srv2, srv1, json3
url = 'QRS8MkLhQmM' url = 'QRS8MkLhQmM'
IE = YoutubeIE IE = YoutubeIE
@ -67,41 +82,60 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles.keys()), 13) self.assertEqual(len(subtitles.keys()), 13)
self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d')
self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') self.assertEqual(md5(subtitles['it']), '0e0b667ba68411d88fd1c5f4f4eab2f9')
for lang in ['fr', 'de']: for lang in ['fr', 'de']:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
def test_youtube_subtitles_ttml_format(self): def _test_subtitles_format(self, fmt, md5_hash, lang='en'):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['subtitlesformat'] = 'ttml' self.DL.params['subtitlesformat'] = fmt
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') self.assertEqual(md5(subtitles[lang]), md5_hash)
def test_youtube_subtitles_ttml_format(self):
self._test_subtitles_format('ttml', 'c97ddf1217390906fa9fbd34901f3da2')
def test_youtube_subtitles_vtt_format(self): def test_youtube_subtitles_vtt_format(self):
self.DL.params['writesubtitles'] = True self._test_subtitles_format('vtt', 'ae1bd34126571a77aabd4d276b28044d')
self.DL.params['subtitlesformat'] = 'vtt'
def test_youtube_subtitles_json3_format(self):
self._test_subtitles_format('json3', '688dd1ce0981683867e7fe6fde2a224b')
def _test_automatic_captions(self, url, lang):
self.url = url
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslangs'] = [lang]
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') self.assertTrue(subtitles[lang] is not None)
def test_youtube_automatic_captions(self): def test_youtube_automatic_captions(self):
self.url = '8YoUxe5ncPo' # Available automatic captions for 8YoUxe5ncPo:
self.DL.params['writeautomaticsub'] = True # Language formats (all in vtt, ttml, srv3, srv2, srv1, json3)
self.DL.params['subtitleslangs'] = ['it'] # gu, zh-Hans, zh-Hant, gd, ga, gl, lb, la, lo, tt, tr,
subtitles = self.getSubtitles() # lv, lt, tk, th, tg, te, fil, haw, yi, ceb, yo, de, da,
self.assertTrue(subtitles['it'] is not None) # el, eo, en, eu, et, es, ru, rw, ro, bn, be, bg, uk, jv,
# bs, ja, or, xh, co, ca, cy, cs, ps, pt, pa, vi, pl, hy,
# hr, ht, hu, hmn, hi, ha, mg, uz, ml, mn, mi, mk, ur,
# mt, ms, mr, ug, ta, my, af, sw, is, am,
# *it*, iw, sv, ar,
# su, zu, az, id, ig, nl, no, ne, ny, fr, ku, fy, fa, fi,
# ka, kk, sr, sq, ko, kn, km, st, sk, si, so, sn, sm, sl,
# ky, sd
# ...
self._test_automatic_captions('8YoUxe5ncPo', 'it')
@unittest.skip('ASR subs all in all supported langs now')
def test_youtube_translated_subtitles(self): def test_youtube_translated_subtitles(self):
# This video has a subtitles track, which can be translated # This video has a subtitles track, which can be translated (#4555)
self.url = 'Ky9eprVWzlI' self._test_automatic_captions('Ky9eprVWzlI', 'it')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslangs'] = ['it']
subtitles = self.getSubtitles()
self.assertTrue(subtitles['it'] is not None)
def test_youtube_nosubtitles(self): def test_youtube_nosubtitles(self):
self.DL.expect_warning('video doesn\'t have subtitles') self.DL.expect_warning('video doesn\'t have subtitles')
self.url = 'n5BB19UTcdA' # Available automatic captions for 8YoUxe5ncPo:
# ...
# 8YoUxe5ncPo has no subtitles
self.url = '8YoUxe5ncPo'
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()

View File

@ -499,7 +499,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
) )
_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_GEO_BYPASS = False _GEO_BYPASS = False