[npo] Fix extraction (#20084)

2019-03-01 00:47:18 +07:00 · 2019-03-01 00:47:18 +07:00 · ff60ec8f02
parent 9d9a8676dc
commit ff60ec8f02
1 changed files with 117 additions and 3 deletions
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -12,11 +12,16 @@ from ..utils import (
    ExtractorError,
    fix_xml_ampersands,
    int_or_none,
    merge_dicts,
    orderedSet,
    parse_duration,
    qualities,
    str_or_none,
    strip_jsonp,
    unified_strdate,
    unified_timestamp,
    url_or_none,
    urlencode_postdata,
 )
@@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):
    def _real_extract(self, url):
        """Return the extraction result for the video that `url` points to."""
        video_id = self._match_id(url)
-        return self._get_info(video_id)
+        try:
            # Preferred path; it needs the page URL in addition to the id.
            return self._get_info(url, video_id)
        except ExtractorError:
            # Any ExtractorError raised by the path above switches to the
            # older metadata-endpoint extraction instead of failing outright.
            return self._get_old_info(video_id)
-    def _get_info(self, video_id):
+    def _get_info(self, url, video_id):
        token = self._download_json(
            'https://www.npostart.nl/api/token', video_id,
            'Downloading token', headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
            })['token']
        player = self._download_json(
            'https://www.npostart.nl/player/%s' % video_id, video_id,
            'Downloading player JSON', data=urlencode_postdata({
                'autoplay': 0,
                'share': 1,
                'pageUrl': url,
                'hasAdConsent': 0,
                '_token': token,
            }))
        player_token = player['token']
        format_urls = set()
        formats = []
        for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
            streams = self._download_json(
                'https://start-player.npo.nl/video/%s/streams' % video_id,
                video_id, 'Downloading %s profile JSON' % profile, fatal=False,
                query={
                    'profile': profile,
                    'quality': 'npo',
                    'tokenId': player_token,
                    'streamType': 'broadcast',
                })
            if not streams:
                continue
            stream = streams.get('stream')
            if not isinstance(stream, dict):
                continue
            stream_url = url_or_none(stream.get('src'))
            if not stream_url or stream_url in format_urls:
                continue
            format_urls.add(stream_url)
            if stream.get('protection') is not None:
                continue
            stream_type = stream.get('type')
            stream_ext = determine_ext(stream_url)
            if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    stream_url, video_id, mpd_id='dash', fatal=False))
            elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_id, ext='mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
            elif '.ism/Manifest' in stream_url:
                formats.extend(self._extract_ism_formats(
                    stream_url, video_id, ism_id='mss', fatal=False))
            else:
                formats.append({
                    'url': stream_url,
                })
        self._sort_formats(formats)
        info = {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
        embed_url = url_or_none(player.get('embedUrl'))
        if embed_url:
            webpage = self._download_webpage(
                embed_url, video_id, 'Downloading embed page', fatal=False)
            if webpage:
                video = self._parse_json(
                    self._search_regex(
                        r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
                        default='{}'), video_id)
                if video:
                    title = video.get('episodeTitle')
                    subtitles = {}
                    subtitles_list = video.get('subtitles')
                    if isinstance(subtitles_list, list):
                        for cc in subtitles_list:
                            cc_url = url_or_none(cc.get('src'))
                            if not cc_url:
                                continue
                            lang = str_or_none(cc.get('language')) or 'nl'
                            subtitles.setdefault(lang, []).append({
                                'url': cc_url,
                            })
                    return merge_dicts({
                        'title': title,
                        'description': video.get('description'),
                        'thumbnail': url_or_none(
                            video.get('still_image_url') or video.get('orig_image_url')),
                        'duration': int_or_none(video.get('duration')),
                        'timestamp': unified_timestamp(video.get('broadcastDate')),
                        'creator': video.get('channel'),
                        'series': video.get('title'),
                        'episode': title,
                        'episode_number': int_or_none(video.get('episodeNumber')),
                        'subtitles': subtitles,
                    }, info)
        return info
    def _get_old_info(self, video_id):
        metadata = self._download_json(
            'http://e.omroep.nl/metadata/%s' % video_id,
            video_id,
@@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
            # JSON
            else:
                video_url = stream_info.get('url')
-            if not video_url or video_url in urls:
+            if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
                continue
            urls.add(video_url)
            if determine_ext(video_url) == 'm3u8':