[npo] Fix extraction (#20084)

pull/20235/head
Sergey M․ 2019-03-01 00:47:18 +07:00
parent 9d9a8676dc
commit ff60ec8f02
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed files with 117 additions and 3 deletions

View File

@ -12,11 +12,16 @@ from ..utils import (
ExtractorError, ExtractorError,
fix_xml_ampersands, fix_xml_ampersands,
int_or_none, int_or_none,
merge_dicts,
orderedSet, orderedSet,
parse_duration, parse_duration,
qualities, qualities,
str_or_none,
strip_jsonp, strip_jsonp,
unified_strdate, unified_strdate,
unified_timestamp,
url_or_none,
urlencode_postdata,
) )
@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):
def _real_extract(self, url):
    """Return the info dict for *url*.

    The npostart.nl based extraction (``_get_info``) is tried first; when
    it raises ``ExtractorError`` the legacy extraction (``_get_old_info``)
    is used instead.
    """
    video_id = self._match_id(url)
    try:
        info = self._get_info(url, video_id)
    except ExtractorError:
        # New API failed: fall back to the previous extraction path.
        info = self._get_old_info(video_id)
    return info
def _get_info(self, url, video_id):
    """Extract video info through the npostart.nl player API.

    Steps: fetch an API token, POST it to the player endpoint to obtain a
    player token, query the stream endpoint once per profile to collect
    formats, then (best effort) read extra metadata from the embed page.

    Parameters:
        url:      page URL; sent as ``Referer`` and as ``pageUrl``.
        video_id: identifier used in the API URLs and as fallback title.

    Returns a dict with ``id``, ``title`` and ``formats``; when the embed
    page yields a ``video`` object, its metadata (title, description,
    thumbnail, duration, timestamp, creator, series, episode, subtitles)
    is merged in on top.

    Raises ExtractorError when either token is missing from its response,
    so that a caller catching ExtractorError can fall back to another
    extraction path instead of seeing a KeyError/TypeError.
    """
    token_response = self._download_json(
        'https://www.npostart.nl/api/token', video_id,
        'Downloading token', headers={
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        })
    token = (
        token_response.get('token')
        if isinstance(token_response, dict) else None)
    if not token:
        raise ExtractorError('Unable to obtain API token')

    player = self._download_json(
        'https://www.npostart.nl/player/%s' % video_id, video_id,
        'Downloading player JSON', data=urlencode_postdata({
            'autoplay': 0,
            'share': 1,
            'pageUrl': url,
            'hasAdConsent': 0,
            '_token': token,
        }))
    player_token = player.get('token') if isinstance(player, dict) else None
    if not player_token:
        raise ExtractorError('Unable to obtain player token')

    format_urls = set()
    formats = []
    for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
        # Each profile is optional: a failed request just skips it.
        streams = self._download_json(
            'https://start-player.npo.nl/video/%s/streams' % video_id,
            video_id, 'Downloading %s profile JSON' % profile, fatal=False,
            query={
                'profile': profile,
                'quality': 'npo',
                'tokenId': player_token,
                'streamType': 'broadcast',
            })
        if not streams:
            continue
        stream = streams.get('stream')
        if not isinstance(stream, dict):
            continue
        stream_url = url_or_none(stream.get('src'))
        if not stream_url or stream_url in format_urls:
            continue
        # Recorded before the protection check, so a protected URL is not
        # reconsidered if another profile returns it again.
        format_urls.add(stream_url)
        # NOTE(review): a non-None 'protection' presumably marks DRM
        # streams - confirm against the API.
        if stream.get('protection') is not None:
            continue
        stream_type = stream.get('type')
        stream_ext = determine_ext(stream_url)
        if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                stream_url, video_id, mpd_id='dash', fatal=False))
        elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                stream_url, video_id, ext='mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
        elif '.ism/Manifest' in stream_url:
            formats.extend(self._extract_ism_formats(
                stream_url, video_id, ism_id='mss', fatal=False))
        else:
            formats.append({
                'url': stream_url,
            })

    # NOTE(review): presumably raises when no formats were collected,
    # which is what lets the caller fall back - confirm in the base class.
    self._sort_formats(formats)

    info = {
        'id': video_id,
        'title': video_id,
        'formats': formats,
    }

    # Everything below is optional metadata; any failure returns the
    # basic info instead of discarding the formats found above.
    embed_url = url_or_none(player.get('embedUrl'))
    if not embed_url:
        return info

    webpage = self._download_webpage(
        embed_url, video_id, 'Downloading embed page', fatal=False)
    if not webpage:
        return info

    video = self._parse_json(
        self._search_regex(
            r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
            default='{}'), video_id, fatal=False)
    if not video or not isinstance(video, dict):
        return info

    title = video.get('episodeTitle')

    subtitles = {}
    subtitles_list = video.get('subtitles')
    if isinstance(subtitles_list, list):
        for cc in subtitles_list:
            # Skip malformed entries rather than failing on .get().
            if not isinstance(cc, dict):
                continue
            cc_url = url_or_none(cc.get('src'))
            if not cc_url:
                continue
            lang = str_or_none(cc.get('language')) or 'nl'
            subtitles.setdefault(lang, []).append({
                'url': cc_url,
            })

    return merge_dicts({
        'title': title,
        'description': video.get('description'),
        'thumbnail': url_or_none(
            video.get('still_image_url') or video.get('orig_image_url')),
        'duration': int_or_none(video.get('duration')),
        'timestamp': unified_timestamp(video.get('broadcastDate')),
        'creator': video.get('channel'),
        'series': video.get('title'),
        'episode': title,
        'episode_number': int_or_none(video.get('episodeNumber')),
        'subtitles': subtitles,
    }, info)
def _get_old_info(self, video_id):
metadata = self._download_json( metadata = self._download_json(
'http://e.omroep.nl/metadata/%s' % video_id, 'http://e.omroep.nl/metadata/%s' % video_id,
video_id, video_id,
@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
# JSON # JSON
else: else:
video_url = stream_info.get('url') video_url = stream_info.get('url')
if not video_url or video_url in urls: if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
continue continue
urls.add(video_url) urls.add(video_url)
if determine_ext(video_url) == 'm3u8': if determine_ext(video_url) == 'm3u8':