# NOTE(review): this text was reconstructed from the post-image of a
# unified diff of src/websites/youtube.rb (blob bdf2c2c, commit
# 6e95377f in whatever-dl.git). Lines 1-16 and 46-48 of the file were
# outside the diff context and are not reproduced here -- confirm
# against the real file before applying.
#
require 'src/website'
require 'cgi'

# Website handler for Youtube video pages. Given one of the supported
# Youtube URL forms, it works out the direct URL of the video file and
# a local filename to save it under.
class Youtube < Website

  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i

  # Format codes, best quality first. Used to pick a format out of
  # the fmt_url_map.
  FORMAT_PREFERENCE = [37, 22, 35, 18, 34, 17]

  # Format codes whose container is MP4; see
  # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
  MP4_FORMATS = [18, 22, 35, 37]

  # Format code whose container is 3GP.
  THREE_GP_FORMAT = 17


  # Returns a truthy value (the match offset) if +url+ looks like a
  # Youtube video URL that this class can handle, and nil otherwise.
  def self.owns_url?(url)
    return url =~ VALID_YOUTUBE_URL_REGEX
  end


  # The url is handed to the superclass untouched (presumably that is
  # where @url gets set -- Website is not visible from here).
  def initialize(url)
    super

    # The @format variable just caches the format of the video we're
    # downloading. Storing it will prevent us from having to calculate
    # it twice. Zero means "unknown".
    @format = 0
  end


  # Returns the URL of the video file for the video identified by
  # @url.
  #
  # The primary strategy reads the fmt_url_map Flash variable from the
  # video's page. If that fails for any reason, we fall back to
  # parsing url_encoded_fmt_stream_map. As a side effect, @format is
  # set to the chosen format code whenever we can determine it.
  #
  # Raises StandardError if the video id can't be parsed or if neither
  # strategy produces a URL.
  def get_video_url()
    video_id = self.parse_video_id()

    # The URL that was passed to the program may not be the page that
    # carries the video data, so we always fetch the /watch?v= form.
    # (get_page_data is not defined in this class; presumably it is
    # inherited from Website.)
    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
    page_data = self.get_page_data(page_data_url)

    begin
      # Get the URL map from the page.
      fmt_url_map = get_format_url_map(page_data)

      # Figure out which formats are available, and choose the best
      # one.
      desired_format = get_desired_format(fmt_url_map.keys())

      # Cache the format so that when we're asked for the video
      # filename later, we don't have to recompute it.
      @format = desired_format

      # Every key in fmt_url_map has a non-empty URL, so this lookup
      # can't come back nil.
      return fmt_url_map[desired_format]
    rescue StandardError
      # If at first you do not succeed, maybe someone decided to
      # change the page layout. This alternate method parses
      # url_encoded_fmt_stream_map.
      fmt_streams = get_fmt_stream_list(page_data)
      video_url = self.choose_best_fmt_stream_url(fmt_streams)

      # A duplicated "itag" parameter results in a 403, so get rid of
      # the first occurrence if there is more than one.
      itag_regex = /&itag=\d+/
      if video_url.scan(itag_regex).length > 1
        video_url.sub!(itag_regex, '')
      end

      # Cache the format on this path too; otherwise the filename
      # would always get the default extension. This assumes the itag
      # value is the same format code that fmt_url_map is keyed by.
      itag_match = /[?&]itag=(\d+)/.match(video_url)
      @format = itag_match[1].to_i unless itag_match.nil?

      return video_url
    end
  end


  # Returns the filename to save the video under: the video id plus
  # an extension derived from the cached @format. The result is only
  # accurate once get_video_url() has been called; before that,
  # @format is unknown and the default extension is used.
  def get_video_filename()
    # The format -> extension mapping is available on Wikipedia:
    #
    #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
    #
    # The default extension is .flv.
    extension = '.flv'

    if MP4_FORMATS.include?(@format)
      extension = '.mp4'
    elsif @format == THREE_GP_FORMAT
      extension = '.3gp'
    end

    return (self.parse_video_id() + extension)
  end


  protected

  # Take a list generated by get_fmt_stream_list() and choose the
  # best URL out of the bunch. In order of preference:
  #
  #   1. MP4 and large quality
  #   2. large quality
  #   3. MP4
  #   4. whatever comes first in the list
  #
  # Raises StandardError if the list is nil or empty.
  def choose_best_fmt_stream_url(fmt_stream_urls)
    if fmt_stream_urls.nil? || fmt_stream_urls.empty?
      raise StandardError.new("Could not find any format stream URLs.")
    end

    mp4_regex = /video\/mp4/
    large_regex = /quality=large/

    # Each preference has to be checked against the whole list before
    # moving on to the next one; deciding per-element would always
    # pick the first URL.
    best = fmt_stream_urls.find { |fs| fs =~ mp4_regex && fs =~ large_regex }
    best ||= fmt_stream_urls.find { |fs| fs =~ large_regex }
    best ||= fmt_stream_urls.find { |fs| fs =~ mp4_regex }

    return (best || fmt_stream_urls.first)
  end


  # Unescape sequences like '\u0026' in +string+ and return the
  # result as a new string.
  def unicode_unescape(string)
    # Ok, only '\u0026' for now. The single quotes are deliberate: we
    # want the six literal characters, not an ampersand.
    return string.gsub('\u0026', '&')
  end


  # Returns the list of video URLs found in the
  # url_encoded_fmt_stream_map variable of +page_data+.
  #
  # This is another method of embedding the video URLs. The variable
  # contains a list of URLs in the form url=foo1,url=foo2...
  #
  # Raises StandardError if the variable can't be found.
  def get_fmt_stream_list(page_data)
    fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/
    matches = fmt_stream_regex.match(page_data)

    if matches.nil?
      raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
    end

    # Split before unescaping so that only the separator commas
    # present in the raw value are treated as separators.
    raw_urls = matches[1].gsub('url=', '').split(',')

    return raw_urls.map do |raw_url|
      url = CGI::unescape(self.unicode_unescape(raw_url))
      # Strip off everything after the first space in the URL.
      # I don't know why this works, but if we leave the space
      # in (encoded, even), Youtube throws us 403 errors.
      url.gsub(/ .+$/, '')
    end
  end


  # Get the video id from @url. Three URL forms are tried in turn.
  #
  # Raises StandardError if none of them match. This is fatal, since
  # without the id we don't know where the video file is located.
  def parse_video_id()
    # All of the URLs are fairly easy to parse if you handle them
    # one at a time. The only tricky situation is when parameters
    # like "&hl=en" are tacked on to the end.
    #
    # The id character class has to agree with the one in
    # VALID_YOUTUBE_URL_REGEX, otherwise we would accept a URL in
    # owns_url? and then fail to parse it here.
    video_id_regexes = [
      # We'll call /watch?v=video_id the "first form."
      /v=([0-9a-z_\-]+)/i,

      # First form didn't work? Try the second, /v/video_id.
      /\/v\/([0-9a-z_\-]+)/i,

      # ...and the third, where the id is the last path component,
      # optionally followed by parameters and/or a trailing '#'.
      /\/([0-9a-z_\-]+)(\&.*)?\#?$/i
    ]

    video_id_regexes.each do |regex|
      matches = regex.match(@url)
      return matches[1] unless matches.nil?
    end

    raise StandardError.new("Could not parse the video id.")
  end


  # Returns a hash mapping format codes (integers) to video URLs,
  # parsed from the fmt_url_map Flash variable in +page_data+.
  #
  # Raises StandardError if the variable can't be found, or if it
  # contains no usable entries.
  def get_format_url_map(page_data)
    url_map_regex = /fmt_url_map=([^&\"]+)/
    matches = url_map_regex.match(page_data)

    if matches.nil?
      raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
    end

    # The map is stored entirely in one Flash variable. The format is
    # key|value,key|value,...
    url_map = {}
    maptext = CGI::unescape(matches[1])

    maptext.split(',').each do |entry|
      # Only the first two fields are used; anything after a second
      # '|' is ignored.
      key, value = entry.split('|')

      # Skip malformed entries rather than storing a bogus format
      # code or a nil URL.
      next if key.nil? || key !~ /\A\d+\z/
      next if value.nil? || value.empty?

      url_map[key.to_i] = value
    end

    if url_map.empty?
      raise StandardError.new("Could not find any valid format URLs.")
    end

    return url_map
  end


  # Returns the best format code in +available_formats+ (a list of
  # integers).
  #
  # We check for the presence of formats in order of preference
  # (quality); the first one found is the best one available.
  def get_desired_format(available_formats)
    FORMAT_PREFERENCE.each do |format|
      return format if available_formats.include?(format)
    end

    # Available formats shouldn't be empty (get_format_url_map raises
    # an error rather than return an empty map), so if there's some
    # unknown format here we might as well return it as a last resort.
    return available_formats[0]
  end

end