From 8fd12a083131550476fb771c180a0734794d0b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Sep 2018 05:38:41 +0700 Subject: [PATCH] [mediaset] Improve embed support (closes #17668) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/mediaset.py | 38 +++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76ef01332..2a48667f0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3023,7 +3023,7 @@ class GenericIE(InfoExtractor): wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(webpage) + mediaset_urls = MediasetIE._extract_urls(self, webpage) if mediaset_urls: return self.playlist_from_matches( mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 57f97409d..df3748798 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -4,6 +4,11 @@ from __future__ import unicode_literals import re from .theplatform import ThePlatformBaseIE +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -76,12 +81,33 @@ class MediasetIE(ThePlatformBaseIE): }] @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r']+\bsrc=(["\'])(?Phttps?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', - webpage)] + def _extract_urls(ie, webpage): + def _qs(url): + return compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + def _program_guid(qs): + return qs.get('programGuid', [None])[0] + + entries = [] + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', + webpage): + embed_url = mobj.group('url') + embed_qs = _qs(embed_url) + program_guid = _program_guid(embed_qs) + if program_guid: + entries.append(embed_url) + continue + video_id = embed_qs.get('id', [None])[0] + if not video_id: + continue + urlh = ie._request_webpage( + embed_url, video_id, note='Following embed URL redirect') + embed_url = compat_str(urlh.geturl()) + program_guid = _program_guid(_qs(embed_url)) + if program_guid: + entries.append(embed_url) + return entries def _real_extract(self, url): guid = self._match_id(url)