[bbc] Improve title and description extraction (Closes #8826, closes #8822)

pull/8845/merge
Sergey M․ 2016-03-13 15:54:56 +06:00
parent 8e4aa7bf18
commit 0e832c2c97
1 changed file with 18 additions and 9 deletions

View File

@@ -563,6 +563,14 @@ class BBCIE(BBCCoUkIE):
'title': 'BBC Blogs - Adam Curtis - BUGGER', 'title': 'BBC Blogs - Adam Curtis - BUGGER',
}, },
'playlist_count': 18, 'playlist_count': 18,
}, {
# school report playlist with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
'info_dict': {
'id': '35744779',
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
}, { }, {
# single video embedded with data-playable containing vpid # single video embedded with data-playable containing vpid
'url': 'http://www.bbc.com/news/world-europe-32041533', 'url': 'http://www.bbc.com/news/world-europe-32041533',
@@ -734,8 +742,17 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp') timestamp = json_ld_info.get('timestamp')
playlist_title = json_ld_info.get('title') playlist_title = json_ld_info.get('title')
playlist_description = json_ld_info.get('description') if not playlist_title:
playlist_title = self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
playlist_description = json_ld_info.get(
'description') or self._og_search_description(webpage, default=None)
if not timestamp: if not timestamp:
timestamp = parse_iso8601(self._search_regex( timestamp = parse_iso8601(self._search_regex(
@@ -795,14 +812,6 @@ class BBCIE(BBCCoUkIE):
entries.append(self._extract_from_playlist_sxml( entries.append(self._extract_from_playlist_sxml(
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
playlist_title = self._og_search_title(webpage, default=None)
playlist_title = playlist_title or self._html_search_regex(
r'<title>(.*?)</title>', webpage, 'playlist title')
playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title)
playlist_description = self._og_search_description(webpage, default=None)
if entries: if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)