From 557194591a3afcf16f2d554aa0af34ce83ca163f Mon Sep 17 00:00:00 2001
From: John Hawkinson
Date: Mon, 10 Apr 2017 01:09:10 -0400
Subject: [PATCH] [washingtonpost] Add support for embeds (closes #12699)

---
 youtube_dl/extractor/generic.py        | 21 +++++++++++++++++++++
 youtube_dl/extractor/washingtonpost.py |  6 ++++++
 2 files changed, 27 insertions(+)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7f7c1ba29..bd9d9aa13 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -87,6 +87,7 @@ from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .limelight import LimelightBaseIE
 from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
 
 
 class GenericIE(InfoExtractor):
@@ -1687,6 +1688,20 @@ class GenericIE(InfoExtractor):
             },
             'playlist_mincount': 4,
         },
+        {
+            # WashingtonPost embed
+            'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+            'info_dict': {
+                'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+                'ext': 'mp4',
+                'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+                'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+                'timestamp': 1455216756,
+                'uploader': 'The Washington Post',
+                'upload_date': '20160211',
+            },
+            'add_ie': [WashingtonPostIE.ie_key()],
+        },
         # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
@@ -2670,6 +2685,12 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 rutube_urls, ie=RutubeIE.ie_key())
 
+        # Look for WashingtonPost embeds
+        wapo_urls = WashingtonPostIE._extract_urls(webpage)
+        if wapo_urls:
+            return self.playlist_from_matches(
+                wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(
             webpage, video_id, default={}, expected_type='VideoObject')
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 839cad986..7e5cf0e1d 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,7 @@ from ..utils import (
 class WashingtonPostIE(InfoExtractor):
     IE_NAME = 'washingtonpost'
     _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_URL = 'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
     _TEST = {
         'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
         'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
         },
     }
 
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_data = self._download_json(