From e72d484c8bf3e719f3f65ada1398772853836a56 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 6 Aug 2011 12:55:20 -0400 Subject: [PATCH] Use single quotes around the URL in our 'wget' command. Add a fallback method for parsing Youtube video URLs. --- src/websites/youtube.rb | 76 ++++++++++++++++++++++++++++++++--------- src/wget_downloader.rb | 2 +- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index c2a37d6..c51ba92 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -49,22 +49,38 @@ class Youtube < Website page_data_url = "http://www.youtube.com/watch?v=#{video_id}" page_data = self.get_page_data(page_data_url) - # Get the URL map from the page. - fmt_url_map = get_format_url_map(page_data) - - # Figure out which formats are available, and if any are, - # choose the best one. - available_formats = fmt_url_map.keys() - desired_format = get_desired_format(available_formats) - - # First we cache the format so that when we're asked for the - # video filename later, we don't have to recompute the format. - @format = desired_format - - # And then use whatever URL is available for the desired format. - # We assume that all available formats will have an entry in the - # fmt_url_map hash. - video_url = fmt_url_map[desired_format] + begin + # Get the URL map from the page. + fmt_url_map = get_format_url_map(page_data) + + # Figure out which formats are available, and if any are, + # choose the best one. + available_formats = fmt_url_map.keys() + desired_format = get_desired_format(available_formats) + + # First we cache the format so that when we're asked for the + # video filename later, we don't have to recompute the format. + @format = desired_format + + # And then use whatever URL is available for the desired format. + # We assume that all available formats will have an entry in the + # fmt_url_map hash. + video_url = fmt_url_map[desired_format] + + return video_url + rescue StandardError => e + # If at first you do not succeed, maybe someone decided to + # change some shit. This alternate method parses + # url_encoded_fmt_stream_map. + fmt_streams = get_fmt_stream_list(page_data) + video_url = self.unicode_unescape(fmt_streams[0]) + video_url = CGI::unescape(video_url) + + # Strip off everything after the first space in the URL. + # I don't know why this works, but if we leave the space + # in (encoded, even), Youtube throws us 403 errors. + video_url.gsub!(/ .+$/, '') + end return video_url end @@ -90,6 +106,34 @@ class Youtube < Website protected; + def unicode_unescape(string) + # Unescape sequences like '\u0026'. + # Ok, only '\u0026' for now. + return string.gsub('\u0026', '&') + end + + def get_fmt_stream_list(page_data) + # This is another (new?) method of embedding the video URLs. + # The url_encoded_fmt_stream_map variable contains a list of URLs + # in the form url=foo1,url=foo2... + # + # It looks like the first one in the list is the highest + # quality? Let's just take that one for now. + fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/ + + matches = fmt_stream_regex.match(page_data) + + if (matches.nil? || matches.length < 2) + raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.") + end + + urlstring = matches[1] + urlstring.gsub!('url=', '') + urls = urlstring.split(',') + return urls + end + + # Get the video id from the URL. Should be relatively easy, # unless Youtube supports some URL formats of which I'm unaware. def parse_video_id() diff --git a/src/wget_downloader.rb b/src/wget_downloader.rb index ce4c760..82235d5 100644 --- a/src/wget_downloader.rb +++ b/src/wget_downloader.rb @@ -34,7 +34,7 @@ class WgetDownloader < Downloader end # This one's easy. - cmd = "wget #{options} -O \"#{outfile}\" \"#{url}\"" + cmd = "wget #{options} -O '#{outfile}' '#{url}'" puts "\nExecuting external command: #{cmd}\n\n" Kernel.exec(cmd) end -- 2.44.2