[bbc] Add support for direct bbc.co.uk embeds

2015-07-27 22:05:51 +06:00 · 2015-07-27 22:05:51 +06:00 · 88ed52aec9
parent 4c6bd5b5b6
commit 88ed52aec9
2 changed files with 30 additions and 12 deletions
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@ -450,6 +450,14 @@ class BBCIE(BBCCoUkIE):
        },
        'playlist_count': 9,
        'skip': 'Save time',
    }, {
        # article with multiple videos embedded with `new SMP()`
        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
        'info_dict': {
            'id': '3662a707-0af9-3149-963f-47bea720b460',
            'title': 'BBC Blogs - Adam Curtis - BUGGER',
        },
        'playlist_count': 18,
    }, {
        # single video embedded with mediaAssetPage.init()
        'url': 'http://www.bbc.com/news/world-europe-32041533',
@ -637,12 +645,30 @@ class BBCIE(BBCCoUkIE):
        playlist_title = self._html_search_regex(
            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage)
+        playlist_description = self._og_search_description(webpage, default=None)
        def extract_all(pattern):
            """Collect every JSON blob in the page that matches `pattern`.

            Each string returned by re.findall(pattern, webpage) is passed
            through self._parse_json in non-fatal mode; falsy results
            (presumably blobs that failed to parse) are left out of the
            returned list.
            """
            parsed_blobs = (
                self._parse_json(raw_json, playlist_id, fatal=False)
                for raw_json in re.findall(pattern, webpage))
            return [blob for blob in parsed_blobs if blob]
        # Multiple video article (e.g.
        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]*)?'
        entries = []
        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
            if embed_url and re.match(EMBED_URL, embed_url):
                entries.append(embed_url)
        entries.extend(re.findall(
            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
        if entries:
            return self.playlist_result(
                [self.url_result(entry, 'BBCCoUk') for entry in entries],
                playlist_id, playlist_title, playlist_description)
        # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
-        medias = list(filter(None, map(
+        medias = extract_all(r"data-media-meta='({[^']+})'")
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(r"data-media-meta='({[^']+})'", webpage))))
        if not medias:
            # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -276,14 +276,6 @@ class GenericIE(InfoExtractor):
                'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
            },
        },
        # BBC iPlayer embeds
        {
            'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
            'info_dict': {
                'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
            },
            'playlist_mincount': 18,
        },
        # RUTV embed
        {
            'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',