X-Git-Url: https://gitweb.michael.orlitzky.com/?a=blobdiff_plain;ds=sidebyside;f=src%2Fwebsites%2Fyoutube.rb;h=2ab4678c11dae6660c0128765281eefffd38e61b;hb=8e886df259246365023322b78f58e4037cb536a4;hp=c51ba9264e1ac056845e3c8c3d67dafe44bc1dd3;hpb=e72d484c8bf3e719f3f65ada1398772853836a56;p=dead%2Fwhatever-dl.git diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index c51ba92..2ab4678 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -66,20 +66,22 @@ class Youtube < Website # We assume that all available formats will have an entry in the # fmt_url_map hash. video_url = fmt_url_map[desired_format] - return video_url - rescue StandardError => e + rescue StandardError # If at first you do not succeed, maybe someone decided to # change some shit. This alternate method parses # url_encoded_fmt_stream_map. fmt_streams = get_fmt_stream_list(page_data) - video_url = self.unicode_unescape(fmt_streams[0]) - video_url = CGI::unescape(video_url) + video_url = self.choose_best_fmt_stream_url(fmt_streams) - # Strip off everything after the first space in the URL. - # I don't know why this works, but if we leave the space - # in (encoded, even), Youtube throws us 403 errors. - video_url.gsub!(/ .+$/, '') + # A duplicated "itag" parameter results in a 403. + itag_regex = /&itag=\d+/ + matches = video_url.scan(itag_regex) + + if matches.length > 1 + # Get rid of the first occurrence. + video_url.sub!(itag_regex, '') + end end return video_url @@ -106,12 +108,30 @@ class Youtube < Website protected; + def choose_best_fmt_stream_url(fmt_stream_urls) + # Take a list, generated by get_fmt_stream_list(), and choose the + # best URL out of the bunch based on the video format. + fmt_stream_urls.each do |fs| + if fs =~ /video\/mp4/ and fs =~ /quality=large/ + return fs + elsif fs =~ /quality=large/ + return fs + elsif fs =~ /video\/mp4/ + return fs + else + return fs + end + end + end + + def unicode_unescape(string) # Unescape sequences like '\u0026'. # Ok, only '\u0026' for now. return string.gsub('\u0026', '&') end + def get_fmt_stream_list(page_data) # This is another (new?) method of embedding the video URLs. # The url_encoded_fmt_stream_map variable contains a list of URLs @@ -130,6 +150,16 @@ class Youtube < Website urlstring = matches[1] urlstring.gsub!('url=', '') urls = urlstring.split(',') + + urls.each_index do |idx| + urls[idx] = self.unicode_unescape(urls[idx]) + urls[idx] = CGI::unescape(urls[idx]) + # Strip off everything after the first space in the URL. + # I don't know why this works, but if we leave the space + # in (encoded, even), Youtube throws us 403 errors. + urls[idx].gsub!(/ .+$/, '') + end + return urls end @@ -143,20 +173,23 @@ class Youtube < Website # We'll call /watch?v=video_id the "first form." first_form_video_id_regex = /v=([0-9a-z_\-]+)/i first_form_matches = first_form_video_id_regex.match(@url) - return first_form_matches[1] if not (first_form_matches.nil? || - first_form_matches.length < 2) + if not first_form_matches.nil? || first_form_matches.length < 2 + return first_form_matches[1] + end # First form didn't work? Try the second. second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i second_form_matches = second_form_video_id_regex.match(@url) - return second_form_matches[1] if not (second_form_matches.nil? || - second_form_matches.length < 2) + if not second_form_matches.nil? || second_form_matches.length < 2 + return second_form_matches[1] + end # ...and the third. third_form_video_id_regex = /\/([[:alnum:]]+)$/i third_form_matches = third_form_video_id_regex.match(@url) - return third_form_matches[1] if not (third_form_matches.nil? || - third_form_matches.length < 2) + if not third_form_matches.nil? || third_form_matches.length < 2 + return third_form_matches[1] + end # If we made it here, we couldn't figure out the video id. Yes, # this is fatal, since we don't know where the video file is