diff options
author | bsiegert <bsiegert@pkgsrc.org> | 2020-11-24 17:16:33 +0000 |
---|---|---|
committer | bsiegert <bsiegert@pkgsrc.org> | 2020-11-24 17:16:33 +0000 |
commit | f6c71d4f8bd2f6c2fbdd51c1fad0d92df6d6f434 (patch) | |
tree | 3e30f422cc9fc95c89a4f7025a23f4fe619b9bdd | |
parent | 887ca7731d87cb28819ba454a603a04915915f8f (diff) | |
download | pkgsrc-f6c71d4f8bd2f6c2fbdd51c1fad0d92df6d6f434.tar.gz |
Pullup ticket #6367 - requested by leot
net/youtube-dl: updates and bugfixes
Revisions pulled up:
- net/youtube-dl/Makefile 1.215-1.218
- net/youtube-dl/distinfo 1.197-1.200
- net/youtube-dl/patches/patch-youtube__dl_extractor_bandcamp.py 1.1
- net/youtube-dl/patches/patch-youtube__dl_extractor_youtube.py deleted
---
Module Name: pkgsrc
Committed By: leot
Date: Sun Nov 1 10:58:24 UTC 2020
Modified Files:
pkgsrc/net/youtube-dl: Makefile distinfo
Added Files:
pkgsrc/net/youtube-dl/patches: patch-youtube__dl_extractor_bandcamp.py
Log Message:
youtube-dl: Add a patch to fix bandcamp extractor
Initially based on (what was) Gilles Pietri's upstream #26684.
PKGREVISION++
---
Module Name: pkgsrc
Committed By: leot
Date: Sun Nov 1 12:14:49 UTC 2020
Modified Files:
pkgsrc/net/youtube-dl: Makefile distinfo
Added Files:
pkgsrc/net/youtube-dl/patches: patch-youtube__dl_extractor_youtube.py
Log Message:
youtube-dl: Fix extraction of YouTube JS player URL (and youtube extractor)
Both versions are probably still present because, even without the patch,
the JS player URL is eventually fetched after several retries.
PKGREVISION++
---
Module Name: pkgsrc
Committed By: leot
Date: Sun Nov 1 18:38:59 UTC 2020
Modified Files:
pkgsrc/net/youtube-dl: Makefile distinfo
Removed Files:
pkgsrc/net/youtube-dl/patches: patch-youtube__dl_extractor_youtube.py
Log Message:
youtube-dl: Update to 20201101.1
Changes:
20201101.1
----------
Core
* [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851)
* [downloader/http] Properly handle missing message in SSLError (#26646)
* [downloader/http] Fix access to not yet opened stream in retry
Extractors
* [youtube] Fix JS player URL extraction
* [ytsearch] Fix extraction (#26920)
* [afreecatv] Fix typo (#26970)
* [23video] Relax URL regular expression (#26870)
+ [ustream] Add support for video.ibm.com (#26894)
* [iqiyi] Fix typo (#26884)
+ [expressen] Add support for di.se (#26670)
* [iprima] Improve video id extraction (#26507, #26494)
---
Module Name: pkgsrc
Committed By: leot
Date: Thu Nov 12 14:41:38 UTC 2020
Modified Files:
pkgsrc/net/youtube-dl: Makefile distinfo
Log Message:
youtube-dl: Update to 20201112
Changes:
20201112
--------
Extractors
* [youtube] Rework extractors
-rw-r--r-- | net/youtube-dl/Makefile | 4 | ||||
-rw-r--r-- | net/youtube-dl/distinfo | 11 | ||||
-rw-r--r-- | net/youtube-dl/patches/patch-youtube__dl_extractor_bandcamp.py | 223 | ||||
-rw-r--r-- | net/youtube-dl/patches/patch-youtube__dl_extractor_youtube.py | 19 |
4 files changed, 250 insertions, 7 deletions
diff --git a/net/youtube-dl/Makefile b/net/youtube-dl/Makefile index 3e125c7adb4..26df2c00d4b 100644 --- a/net/youtube-dl/Makefile +++ b/net/youtube-dl/Makefile @@ -1,8 +1,8 @@ -# $NetBSD: Makefile,v 1.214 2020/09/20 09:12:28 leot Exp $ +# $NetBSD: Makefile,v 1.214.2.1 2020/11/24 17:16:33 bsiegert Exp $ # XXX: VERSION_DATE can contains also an optional part that indicates # XXX: possible same day revisions. PKGNAME preserves that dotted part as is. -VERSION_DATE= 2020.09.20 +VERSION_DATE= 2020.11.12 DISTNAME= youtube-dl-${VERSION_DATE} PKGNAME= ${DISTNAME:S/.//:S/.//} CATEGORIES= net diff --git a/net/youtube-dl/distinfo b/net/youtube-dl/distinfo index 65d9234f6f4..e699c487675 100644 --- a/net/youtube-dl/distinfo +++ b/net/youtube-dl/distinfo @@ -1,10 +1,11 @@ -$NetBSD: distinfo,v 1.196 2020/09/20 09:12:28 leot Exp $ +$NetBSD: distinfo,v 1.196.2.1 2020/11/24 17:16:33 bsiegert Exp $ -SHA1 (youtube-dl-2020.09.20.tar.gz) = 2902fa18c30cc3851d5b7fc2932f3f986432b9ac -RMD160 (youtube-dl-2020.09.20.tar.gz) = 107732952ec9238057a1bfc3fd7147ce4026f451 -SHA512 (youtube-dl-2020.09.20.tar.gz) = ed511016d0cfcbbee0cd651f793cb31cdb46d80243f86186de1cc54cb2a2055fae2f5cc3e16f838fc2ba47f3ff6f3b484219c8a707904fbc30193a2b28a1e30c -Size (youtube-dl-2020.09.20.tar.gz) = 3188480 bytes +SHA1 (youtube-dl-2020.11.12.tar.gz) = 04e72d0b0a0e85b79a6c2ac93b7c85254b95b53b +RMD160 (youtube-dl-2020.11.12.tar.gz) = 2afd73b5c09463951086b29298489f0d203a2207 +SHA512 (youtube-dl-2020.11.12.tar.gz) = 7db373f6cc252635a3613ffe0b3b10640e262778105ebbd78b837fe019b0a2609032d2aeb81b239e000a86220aff99d2c018a9a6325adad6981a8ab64048131c +Size (youtube-dl-2020.11.12.tar.gz) = 3188015 bytes SHA1 (patch-setup.py) = a67074ae7cfe5e77847c2f610337ea553eddb69b +SHA1 (patch-youtube__dl_extractor_bandcamp.py) = 81855a3f4f8c03f61fe543eb339c0e67bf52682e SHA1 (patch-youtube__dl_extractor_la7.py) = e246750808305343227060acdc5a38583ef071e9 SHA1 (patch-youtube__dl_extractor_rai.py) = 3dbad7852b38e7364a248a5c9851c50cd2ff9b38 SHA1 
(patch-youtube__dl_postprocessor_ffmpeg.py) = f96676170a448d9205d542a7def4beca615a1490 diff --git a/net/youtube-dl/patches/patch-youtube__dl_extractor_bandcamp.py b/net/youtube-dl/patches/patch-youtube__dl_extractor_bandcamp.py new file mode 100644 index 00000000000..bdf6ef4ebaa --- /dev/null +++ b/net/youtube-dl/patches/patch-youtube__dl_extractor_bandcamp.py @@ -0,0 +1,223 @@ +$NetBSD: patch-youtube__dl_extractor_bandcamp.py,v 1.2.2.2 2020/11/24 17:16:33 bsiegert Exp $ + +[bandcamp] Update to handle HTML quoted data + +Adjust the extractor to handle JSON data-* attributes by introducing a +_json_data_extract() method to handle them (and existing existing +patterns in the code). + +Based on Gilles Pietri #26684. + +--- youtube_dl/extractor/bandcamp.py.orig 2020-09-20 05:29:46.000000000 +0000 ++++ youtube_dl/extractor/bandcamp.py +@@ -35,12 +35,15 @@ class BandcampIE(InfoExtractor): + 'ext': 'mp3', + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, ++ 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", ++ 'timestamp': 1354224127, ++ 'upload_date': '20121129', + }, + '_skip': 'There is a limit of 200 free downloads / month for the test song' + }, { + # free download + 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', +- 'md5': '853e35bf34aa1d6fe2615ae612564b36', ++ 'md5': '149170678c0a81a009c69566bf42920a', + 'info_dict': { + 'id': '2650410135', + 'ext': 'aiff', +@@ -79,6 +82,14 @@ class BandcampIE(InfoExtractor): + }, + }] + ++ def _json_data_extract(self, data_key, video_id, webpage): ++ return self._parse_json( ++ self._search_regex( ++ r'data-' + data_key + r'=(["\'])(?P<data>{.+?})\1', ++ webpage, 'JSON data {data_key}'.format(data_key=data_key), ++ group='data', default=None), ++ video_id, transform_source=unescapeHTML) ++ + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') +@@ -91,10 +102,9 @@ class BandcampIE(InfoExtractor): + duration = None + + 
formats = [] +- track_info = self._parse_json( +- self._search_regex( +- r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', +- webpage, 'track info', default='{}'), title) ++ tralbum_data = self._json_data_extract('tralbum', title, webpage) ++ embed_data = self._json_data_extract('embed', title, webpage) ++ track_info = tralbum_data['trackinfo'][0] + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): +@@ -110,38 +120,28 @@ class BandcampIE(InfoExtractor): + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) +- track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) + + def extract(key): +- return self._search_regex( +- r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, +- webpage, key, default=None, group='value') ++ for data in tralbum_data['current'], embed_data, tralbum_data: ++ if key in data and data[key]: ++ return data[key] + + artist = extract('artist') ++ track = extract('title') + album = extract('album_title') + timestamp = unified_timestamp( + extract('publish_date') or extract('album_publish_date')) + release_date = unified_strdate(extract('album_release_date')) + +- download_link = self._search_regex( +- r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, +- 'download link', default=None, group='url') ++ download_link = tralbum_data['freeDownloadPage'] + if download_link: +- track_id = self._search_regex( +- r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', +- webpage, 'track id') +- + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') + +- blob = self._parse_json( +- self._search_regex( +- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, +- 'blob', group='blob'), +- track_id, transform_source=unescapeHTML) ++ blob = self._json_data_extract('blob', track_id, 
download_webpage) + + info = try_get( + blob, (lambda x: x['digital_items'][0], +@@ -218,7 +218,7 @@ class BandcampIE(InfoExtractor): + } + + +-class BandcampAlbumIE(InfoExtractor): ++class BandcampAlbumIE(BandcampIE): + IE_NAME = 'Bandcamp:album' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + +@@ -299,26 +299,23 @@ class BandcampAlbumIE(InfoExtractor): + album_id = mobj.group('album_id') + playlist_id = album_id or uploader_id + webpage = self._download_webpage(url, playlist_id) +- track_elements = re.findall( +- r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) ++ ++ tralbum_data = self._json_data_extract('tralbum', album_id, webpage) ++ embed_data = self._json_data_extract('embed', album_id, webpage) ++ title = embed_data.get('album_title') ++ ++ track_elements = tralbum_data['trackinfo'] + if not track_elements: + raise ExtractorError('The page doesn\'t contain any tracks') + # Only tracks with duration info have songs + entries = [ + self.url_result( +- compat_urlparse.urljoin(url, t_path), ++ compat_urlparse.urljoin(url, t['title_link']), + ie=BandcampIE.ie_key(), +- video_title=self._search_regex( +- r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', +- elem_content, 'track title', fatal=False)) +- for elem_content, t_path in track_elements +- if self._html_search_meta('duration', elem_content, default=None)] +- +- title = self._html_search_regex( +- r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', +- webpage, 'title', fatal=False) +- if title: +- title = title.replace(r'\"', '"') ++ video_title=t['title']) ++ for t in track_elements ++ if t['duration']] ++ + return { + '_type': 'playlist', + 'uploader_id': uploader_id, +@@ -328,22 +325,21 @@ class BandcampAlbumIE(InfoExtractor): + } + + +-class BandcampWeeklyIE(InfoExtractor): ++class BandcampWeeklyIE(BandcampIE): + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = 
r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://bandcamp.com/?show=224', +- 'md5': 'b00df799c733cf7e0c567ed187dea0fd', ++ 'md5': '61acc9a002bed93986b91168aa3ab433', + 'info_dict': { + 'id': '224', +- 'ext': 'opus', ++ 'ext': 'mp3', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', +- 'episode_number': 208, + 'episode_id': '224', + } + }, { +@@ -355,13 +351,13 @@ class BandcampWeeklyIE(InfoExtractor): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + +- blob = self._parse_json( +- self._search_regex( +- r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, +- 'blob', group='blob'), +- video_id, transform_source=unescapeHTML) ++ blob = self._json_data_extract('blob', video_id, webpage) + +- show = blob['bcw_show'] ++ show = None ++ for bd in blob['bcw_data']: ++ if blob['bcw_data'][bd].get('expanded'): ++ show = blob['bcw_data'][bd] ++ break + + # This is desired because any invalid show id redirects to `bandcamp.com` + # which happens to expose the latest Bandcamp Weekly episode. 
+@@ -390,18 +386,6 @@ class BandcampWeeklyIE(InfoExtractor): + if subtitle: + title += ' - %s' % subtitle + +- episode_number = None +- seq = blob.get('bcw_seq') +- +- if seq and isinstance(seq, list): +- try: +- episode_number = next( +- int_or_none(e.get('episode_number')) +- for e in seq +- if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) +- except StopIteration: +- pass +- + return { + 'id': video_id, + 'title': title, +@@ -411,7 +395,6 @@ class BandcampWeeklyIE(InfoExtractor): + 'release_date': unified_strdate(show.get('published_date')), + 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), +- 'episode_number': episode_number, + 'episode_id': compat_str(video_id), + 'formats': formats + } diff --git a/net/youtube-dl/patches/patch-youtube__dl_extractor_youtube.py b/net/youtube-dl/patches/patch-youtube__dl_extractor_youtube.py new file mode 100644 index 00000000000..03d032c4bd2 --- /dev/null +++ b/net/youtube-dl/patches/patch-youtube__dl_extractor_youtube.py @@ -0,0 +1,19 @@ +$NetBSD: patch-youtube__dl_extractor_youtube.py,v 1.2.2.2 2020/11/24 17:16:33 bsiegert Exp $ + +Update JS player URL to current one. + +Via: + + https://github.com/blackjack4494/yt-dlc/pull/35 + +--- youtube_dl/extractor/youtube.py.orig 2020-09-20 05:29:46.000000000 +0000 ++++ youtube_dl/extractor/youtube.py +@@ -2086,7 +2086,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor + + if cipher: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): +- ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' ++ ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))' + jsplayer_url_json = self._search_regex( + ASSETS_RE, + embed_webpage if age_gate else video_webpage, |