X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fwhatever-dl.git;a=blobdiff_plain;f=src%2Fwebsites%2Fyoutube.rb;h=bdf2c2c4ba7bd28d25ecd1a236e8534142f8d2aa;hp=ed5ab4b8a96efa62ee49379caf205f9b89c5e69c;hb=6e95377fe761bb0988e6e5e27327c0f189a2ecb7;hpb=dbde07faafd93f8a9503c1fedfe317723da7cac5 diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index ed5ab4b..bdf2c2c 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -22,7 +22,7 @@ require 'cgi' class Youtube < Website VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i - + def self.owns_url?(url) return url =~ VALID_YOUTUBE_URL_REGEX end @@ -30,14 +30,14 @@ class Youtube < Website def initialize(url) super - + # The @format variable just caches the format of the video we're # downloading. Storing it will prevent us from having to calculate # it twice. @format = 0 end - + def get_video_url() video_id = self.parse_video_id() @@ -49,25 +49,41 @@ class Youtube < Website page_data_url = "http://www.youtube.com/watch?v=#{video_id}" page_data = self.get_page_data(page_data_url) - # Magic. - t_parameter = self.parse_t_parameter(page_data) - - video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}" - - # Figure out which formats are available, and if any are, - # choose the best one. - available_formats = get_available_formats(page_data) - desired_format = get_desired_format(available_formats) - - if not desired_format.nil? + begin + # Get the URL map from the page. + fmt_url_map = get_format_url_map(page_data) + + # Figure out which formats are available, and if any are, + # choose the best one. + available_formats = fmt_url_map.keys() + desired_format = get_desired_format(available_formats) + # First we cache the format so that when we're asked for the # video filename later, we don't have to recompute the format. @format = desired_format - # And then stick the format parameter on the end of the URL. - video_url = video_url + "&fmt=#{desired_format}" + # And then use whatever URL is available for the desired format. + # We assume that all available formats will have an entry in the + # fmt_url_map hash. + video_url = fmt_url_map[desired_format] + return video_url + rescue StandardError => e + # If at first you do not succeed, maybe someone decided to + # change some shit. This alternate method parses + # url_encoded_fmt_stream_map. + fmt_streams = get_fmt_stream_list(page_data) + video_url = self.choose_best_fmt_stream_url(fmt_streams) + + # A duplicated "itag" parameter results in a 403. + itag_regex = /&itag=\d+/ + matches = video_url.scan(itag_regex) + + if matches.length > 1 + # Get rid of the first occurrence. + video_url.sub!(itag_regex, '') + end end - + return video_url end @@ -79,19 +95,75 @@ class Youtube < Website # # The default extension is .flv. extension = '.flv' - + if [18, 22, 35, 37].include?(@format) extension = '.mp4' elsif (@format == 17) extension = '.3gp' end - + return (self.parse_video_id() + extension) end - + protected; + def choose_best_fmt_stream_url(fmt_stream_urls) + # Take a list, generated by get_fmt_stream_list(), and choose the + # best URL out of the bunch based on the video format. + fmt_stream_urls.each do |fs| + if fs =~ /video\/mp4/ and fs =~ /quality=large/ + return fs + elsif fs =~ /quality=large/ + return fs + elsif fs =~ /video\/mp4/ + return fs + else + return fs + end + end + end + + + def unicode_unescape(string) + # Unescape sequences like '\u0026'. + # Ok, only '\u0026' for now. + return string.gsub('\u0026', '&') + end + + + def get_fmt_stream_list(page_data) + # This is another (new?) method of embedding the video URLs. + # The url_encoded_fmt_stream_map variable contains a list of URLs + # in the form url=foo1,url=foo2... + # + # It looks like the first one in the list is the highest + # quality? Let's just take that one for now. + fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/ + + matches = fmt_stream_regex.match(page_data) + + if (matches.nil? || matches.length < 2) + raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.") + end + + urlstring = matches[1] + urlstring.gsub!('url=', '') + urls = urlstring.split(',') + + urls.each_index do |idx| + urls[idx] = self.unicode_unescape(urls[idx]) + urls[idx] = CGI::unescape(urls[idx]) + # Strip off everything after the first space in the URL. + # I don't know why this works, but if we leave the space + # in (encoded, even), Youtube throws us 403 errors. + urls[idx].gsub!(/ .+$/, '') + end + + return urls + end + + # Get the video id from the URL. Should be relatively easy, # unless Youtube supports some URL formats of which I'm unaware. def parse_video_id() @@ -123,55 +195,35 @@ class Youtube < Website end - # Parse out the "t" parameter from the video's page. I'm not sure - # what "t" stands for, but it's required for the final video URL to - # work. It can be stored in either JSON or URL parameters. - def parse_t_parameter(page_data) - t_parameter = nil - - t_parameter_regexes = [ /\"t\"\:[[:space:]]\"([^\"]+?)\"/, - /&t=([^&\"\\]+)/ ] - matches = t_parameter_regexes.map { |tpr| tpr.match(page_data) } - if matches.nitems == 0 - raise StandardError.new("Could not parse the 't' parameter.") - end - - first_match = matches.compact[0] - t_parameter = CGI::unescape(first_match[1]) - - return t_parameter - end - + def get_format_url_map(page_data) + # Youtube has implemented a new fmt_url_map that (perhaps + # unsurprisingly) maps formats to video URLs. This makes it + # easyish to parse the video URLs. + url_map = {} + url_map_regex = /fmt_url_map=([^&\"]+)/ - def get_available_formats(page_data) - # Parse the list of available formats from the "fmt_list" - # variable. It can be stored as either a Flash variable (JSON - # notation), or as URL parameter. - available_formats = [] - fmt_list_regexes = [ /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/, - /fmt_list=([^&\"\\]+)/ ] + matches = url_map_regex.match(page_data) - matches = fmt_list_regexes.map { |flr| flr.match(page_data) } + if (matches.nil? || matches.length < 1) + raise StandardError.new("Could not parse the fmt_url_map Flash variable.") + end - if matches.nitems == 0 - raise StandardError.new("Could not find any valid formats.") + # The map is stored entirely in one Flash variable. The format is + # key|value,key|value,... + maptext = CGI::unescape(matches[1]) + entries = maptext.split(',') + entries.each do |entry| + key = entry.split('|')[0].to_i + value = entry.split('|')[1] + url_map[key] = value end - first_match = matches.compact[0] - fmts_string = CGI::unescape(first_match[1]) - - fmts_string.split(',').each do |fmt| - # Each "fmt" will look something like, - # - # 35/640000/9/0/115 - # - # with the format identifier coming before the first slash. - first_slash_idx = fmt.index('/') - available_formats << fmt[0...first_slash_idx].to_i + if (url_map.length < 1) + raise StandardError.new("Could not find any valid format URLs.") end - - return available_formats + + return url_map end @@ -187,6 +239,11 @@ class Youtube < Website return 18 if available_formats.include?(18) return 34 if available_formats.include?(34) return 17 if available_formats.include?(17) + + # Available formats can't be empty (we would have raised an error + # in get_available_formats), so if there's some unknown format + # here we might as well return it as a last resort. + return available_formats[0] end - + end