From 676e3ecf245f230411c9d534b0a4e918c01f5d86 Mon Sep 17 00:00:00 2001 From: peugeot Date: Sat, 30 Aug 2014 17:17:47 +0200 Subject: [PATCH 1/5] Add support for Vporn --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vporn.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 youtube_dl/extractor/vporn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1479d998a..6d94984de 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -393,6 +393,7 @@ from .vine import ( from .viki import VikiIE from .vk import VKIE from .vodlocker import VodlockerIE +from .vporn import VpornIE from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py new file mode 100644 index 000000000..bdbd7543d --- /dev/null +++ b/youtube_dl/extractor/vporn.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class VpornIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?vporn\.com/[a-z]+/(?P[a-z-]+)/(?P\d+)/?' + _TEST = { + 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', + 'md5': 'facf37c1b86546fa0208058546842c55', + 'info_dict': { + 'id': '497944', + 'ext': 'mp4', + 'title': 'Violet On Her 19th Birthday', + 'description': 'Violet dances in front of the camera which is sure to get you horny.', + 'duration': 393, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'(.*?) - Vporn Video', webpage, 'title') + video_url = self._html_search_regex(r'flashvars.videoUrlMedium = "(.*?)"', webpage, 'video_url') + description = self._html_search_regex(r'
(.*?)
', webpage, 'description') + thumbnail = 'http://www.vporn.com' + self._html_search_regex(r'flashvars.imageUrl = "(.*?)"', webpage, 'description') + + mobj = re.search( + r'duration (?P\d+) min (?P\d+) sec ', webpage) + duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + + return { + 'id': video_id, + 'url': video_url, + 'thumbnail': thumbnail, + 'title': title, + 'description': description, + 'duration': duration, + } From 12c82cf9cb59e97186b96fea76a15b52a0a9bb37 Mon Sep 17 00:00:00 2001 From: peugeot Date: Sat, 30 Aug 2014 20:54:38 +0200 Subject: [PATCH 2/5] add support for view count --- youtube_dl/extractor/vporn.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index bdbd7543d..0d182b650 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import int_or_none class VpornIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?vporn\.com/[a-z]+/(?P[a-z-]+)/(?P\d+)/?' @@ -30,15 +31,18 @@ class VpornIE(InfoExtractor): description = self._html_search_regex(r'
(.*?)
', webpage, 'description') thumbnail = 'http://www.vporn.com' + self._html_search_regex(r'flashvars.imageUrl = "(.*?)"', webpage, 'description') - mobj = re.search( - r'duration (?P\d+) min (?P\d+) sec ', webpage) + mobj = re.search(r'duration (?P\d+) min (?P\d+) sec ', webpage) duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + mobj = re.search(r'((?P\d+),)?(?P\d+) VIEWS', webpage) + view_count = int(mobj.group('thousands')) * 1000 + int(mobj.group('units')) if mobj else None + return { 'id': video_id, 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, - 'duration': duration, + 'duration': int_or_none(duration), + 'view_count': int_or_none(view_count), } From 9c4c233b846bd9be4c48b788d6d2347af764f15d Mon Sep 17 00:00:00 2001 From: peugeot Date: Sat, 30 Aug 2014 23:05:33 +0200 Subject: [PATCH 3/5] Fix exception with n_views<1000 --- youtube_dl/extractor/vporn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index 0d182b650..645e935ec 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -35,7 +35,11 @@ class VpornIE(InfoExtractor): duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None mobj = re.search(r'((?P\d+),)?(?P\d+) VIEWS', webpage) - view_count = int(mobj.group('thousands')) * 1000 + int(mobj.group('units')) if mobj else None + try: + view_count = int(mobj.group('units')) + view_count += int(mobj.group('thousands')) * 1000 + except: + pass return { 'id': video_id, From ca7b3246b69215c890193acaf4eab746bc19504e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Aug 2014 06:41:30 +0700 Subject: [PATCH 4/5] [utils] Improve parse_duration --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3d14f61fb..8d8997977 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -211,6 +211,9 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('00:01:01'), 61) self.assertEqual(parse_duration('x:y'), None) self.assertEqual(parse_duration('3h11m53s'), 11513) + self.assertEqual(parse_duration('3h 11m 53s'), 11513) + self.assertEqual(parse_duration('3 hours 11 minutes 53 seconds'), 11513) + self.assertEqual(parse_duration('3 hours 11 mins 53 secs'), 11513) self.assertEqual(parse_duration('62m45s'), 3765) self.assertEqual(parse_duration('6m59s'), 419) self.assertEqual(parse_duration('49s'), 49) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4f0210872..e07750434 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1332,8 +1332,10 @@ def parse_duration(s): if s is None: return None + s = s.strip() + m = re.match( - r'(?:(?:(?P[0-9]+)[:h])?(?P[0-9]+)[:m])?(?P[0-9]+)s?(?::[0-9]+)?(?P\.[0-9]+)?$', s) + r'(?:(?:(?P[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P[0-9]+)(?P\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s) if not m: return None res = int(m.group('secs')) From 7b53af7f70da81eae41da645cc5af2c777c5c8e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 31 Aug 2014 06:43:36 +0700 Subject: [PATCH 5/5] [vporn] Fix issues, extract all formats and metadata --- youtube_dl/extractor/vporn.py | 91 ++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index 645e935ec..426369c51 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -1,52 +1,99 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + parse_duration, + str_to_int, +) + class VpornIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?vporn\.com/[a-z]+/(?P[a-z-]+)/(?P\d+)/?' + _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P[^/]+)/(?P\d+)' _TEST = { 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/', 'md5': 'facf37c1b86546fa0208058546842c55', 'info_dict': { 'id': '497944', + 'display_id': 'violet-on-her-th-birthday', 'ext': 'mp4', - 'title': 'Violet On Her 19th Birthday', + 'title': 'Violet on her 19th birthday', 'description': 'Violet dances in front of the camera which is sure to get you horny.', - 'duration': 393, 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'kileyGrope', + 'categories': ['Masturbation', 'Teen'], + 'duration': 393, + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?) - Vporn Video', webpage, 'title') - video_url = self._html_search_regex(r'flashvars.videoUrlMedium = "(.*?)"', webpage, 'video_url') - description = self._html_search_regex(r'
(.*?)
', webpage, 'description') - thumbnail = 'http://www.vporn.com' + self._html_search_regex(r'flashvars.imageUrl = "(.*?)"', webpage, 'description') + webpage = self._download_webpage(url, display_id) - mobj = re.search(r'duration (?P\d+) min (?P\d+) sec ', webpage) - duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + title = self._html_search_regex( + r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip() + description = self._html_search_regex( + r'
(.*?)
', webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None) + if thumbnail: + thumbnail = 'http://www.vporn.com' + thumbnail - mobj = re.search(r'((?P\d+),)?(?P\d+) VIEWS', webpage) - try: - view_count = int(mobj.group('units')) - view_count += int(mobj.group('thousands')) * 1000 - except: - pass + uploader = self._html_search_regex( + r'(?s)UPLOADED BY.*?([^<]+)', + webpage, 'uploader', fatal=False) + + categories = re.findall(r'([^<]+)', webpage) + + duration = parse_duration(self._search_regex( + r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False)) + + view_count = str_to_int(self._html_search_regex( + r'([\d,\.]+) VIEWS', webpage, 'view count', fatal=False)) + like_count = str_to_int(self._html_search_regex( + r'([\d,\.]+)', webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._html_search_regex( + r'([\d,\.]+)', webpage, 'dislike count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'

Comments \(([\d,\.]+)\)

', webpage, 'comment count', fatal=False)) + + formats = [] + + for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"([^"]+)"', webpage): + video_url = video[1] + fmt = { + 'url': video_url, + 'format_id': video[0], + } + m = re.search(r'_(?P\d+)x(?P\d+)_(?P\d+)k\.mp4$', video_url) + if m: + fmt.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'vbr': int(m.group('vbr')), + }) + formats.append(fmt) + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, - 'thumbnail': thumbnail, + 'display_id': display_id, 'title': title, 'description': description, - 'duration': int_or_none(duration), - 'view_count': int_or_none(view_count), + 'thumbnail': thumbnail, + 'uploader': uploader, + 'categories': categories, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'age_limit': 18, + 'formats': formats, }