#
# Copyright Michael Orlitzky
#
# http://michael.orlitzky.com/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# http://www.fsf.org/licensing/licenses/gpl.html
#

require 'src/website'
require 'cgi'

# Website implementation for Youtube. Given one of the supported
# Youtube URL forms, it works out the URL of the underlying video file
# and a local filename (video id plus extension) to save it under.
class Youtube < Website

  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i

  # Format codes in order of preference (quality), best first. Used by
  # get_desired_format().
  FORMAT_PREFERENCE = [37, 22, 35, 18, 34, 17]


  # Returns a true value (the match offset) if the given url looks
  # like a Youtube video URL, and nil otherwise.
  def self.owns_url?(url)
    return url =~ VALID_YOUTUBE_URL_REGEX
  end


  def initialize(url)
    super

    # The @format variable just caches the format of the video we're
    # downloading. Storing it will prevent us from having to calculate
    # it twice. Zero means "unknown".
    @format = 0
  end


  # Returns the URL of the video file itself.
  #
  # Two strategies are tried: first the fmt_url_map variable, and, if
  # anything goes wrong with that, the url_encoded_fmt_stream_map
  # variable. A StandardError is raised if both of them fail.
  def get_video_url()
    video_id = self.parse_video_id()

    # The video's URL (the "page data" URL) may be different from the
    # URL that was passed to the program. We support the /v/video_id
    # URL format, but that is *not* the main video page where we can
    # retrieve the "t" parameter. We can only get that from the
    # /watch?v=video_id form.
    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
    page_data = self.get_page_data(page_data_url)

    begin
      # Get the URL map from the page.
      fmt_url_map = get_format_url_map(page_data)

      # Figure out which formats are available, and if any are,
      # choose the best one.
      desired_format = get_desired_format(fmt_url_map.keys())

      # Cache the format so that when we're asked for the video
      # filename later, we don't have to recompute it.
      @format = desired_format

      # Every key of fmt_url_map has a usable URL, since
      # get_format_url_map() drops the entries that don't.
      return fmt_url_map[desired_format]
    rescue StandardError
      # If at first you do not succeed, maybe the page layout has
      # changed. This alternate method parses
      # url_encoded_fmt_stream_map.
      fmt_streams = get_fmt_stream_list(page_data)
      video_url = self.choose_best_fmt_stream_url(fmt_streams)

      # The itag is the format code of the stream. Remember it before
      # it gets stripped, so that get_video_filename() can choose the
      # right extension on this code path too.
      itag_match = /itag=(\d+)/.match(video_url)
      @format = itag_match[1].to_i unless itag_match.nil?

      # The "itag" parameter makes the 403 happen.
      video_url.gsub!(/itag=\d+&/, '')
    end

    return video_url
  end


  # Returns the local filename for the video: the video id followed
  # by an extension derived from the cached @format.
  def get_video_filename()
    # The format -> extension mapping is available on Wikipedia:
    #
    #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
    #
    # The default extension is .flv.
    extension = '.flv'

    if [18, 22, 35, 37].include?(@format)
      extension = '.mp4'
    elsif (@format == 17)
      extension = '.3gp'
    end

    return (self.parse_video_id() + extension)
  end


  protected


  # Take a list, generated by get_fmt_stream_list(), and choose one
  # URL out of the bunch.
  #
  # The list appears to be ordered best-quality-first (see
  # get_fmt_stream_list), so the first entry is the one we want. The
  # previous implementation tested each entry for "video/mp4" and
  # "quality=large", but every branch returned the entry being
  # examined, so it always returned the first one as well.
  #
  # Raises a StandardError if the list is nil or empty.
  def choose_best_fmt_stream_url(fmt_stream_urls)
    if (fmt_stream_urls.nil? || fmt_stream_urls.empty?)
      raise StandardError.new("Could not find any format stream URLs.")
    end

    return fmt_stream_urls.first
  end


  # Unescape sequences like '\u0026'.
  def unicode_unescape(string)
    # Ok, only '\u0026' for now. The single quotes make this the
    # literal six-character sequence, not an ampersand.
    return string.gsub('\u0026', '&')
  end


  # Returns the list of video URLs found in the
  # url_encoded_fmt_stream_map variable of page_data.
  #
  # Raises a StandardError if the variable can't be found.
  def get_fmt_stream_list(page_data)
    # This is another (new?) method of embedding the video URLs.
    # The url_encoded_fmt_stream_map variable contains a list of URLs
    # in the form url=foo1,url=foo2...
    #
    # It looks like the first one in the list is the highest
    # quality? Let's just take that one for now.
    fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/

    matches = fmt_stream_regex.match(page_data)

    # The regex has exactly one capture group, so a successful match
    # always has matches[1].
    if matches.nil?
      raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
    end

    urlstring = matches[1].gsub('url=', '')

    urls = urlstring.split(',').map do |raw_url|
      url = CGI::unescape(self.unicode_unescape(raw_url))

      # Strip off everything after the first space in the URL.
      # I don't know why this works, but if we leave the space
      # in (encoded, even), Youtube throws us 403 errors.
      url.gsub(/ .+$/, '')
    end

    return urls
  end


  # Get the video id from the URL. Should be relatively easy,
  # unless Youtube supports some URL formats of which I'm unaware.
  #
  # Raises a StandardError if none of the known forms match.
  def parse_video_id()
    # All of the URL forms are fairly easy to parse if you handle
    # them one at a time. The only tricky situation is when
    # parameters like "&hl=en" are tacked on to the end.
    id_regexes = [
      # We'll call /watch?v=video_id the "first form."
      /v=([0-9a-z_\-]+)/i,

      # The second form is /v/video_id.
      /\/v\/([0-9a-z_\-]+)/i,

      # And the third form has the video id as the last component of
      # the URL. Video ids may contain underscores and hyphens, and
      # VALID_YOUTUBE_URL_REGEX allows trailing parameters and a
      # trailing '#', so we allow for all of those here as well.
      /\/([0-9a-z_\-]+)(\&.*)?\#?$/i
    ]

    id_regexes.each do |id_regex|
      matches = id_regex.match(@url)
      return matches[1] unless matches.nil?
    end

    # If we made it here, we couldn't figure out the video id. Yes,
    # this is fatal, since we don't know where the video file is
    # located.
    raise StandardError.new("Could not parse the video id.")
  end


  # Returns a hash mapping (integer) format codes to video URLs,
  # parsed from the fmt_url_map variable of page_data.
  #
  # Raises a StandardError if the variable can't be found, or if it
  # contains no usable entries.
  def get_format_url_map(page_data)
    # Youtube has implemented a new fmt_url_map that (perhaps
    # unsurprisingly) maps formats to video URLs. This makes it
    # easyish to parse the video URLs.
    url_map_regex = /fmt_url_map=([^&\"]+)/
    matches = url_map_regex.match(page_data)

    if matches.nil?
      raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
    end

    # The map is stored entirely in one Flash variable. The format is
    #
    #   key|value,key|value,...
    #
    maptext = CGI::unescape(matches[1])

    url_map = {}

    maptext.split(',').each do |entry|
      # Anything after a second '|' is ignored.
      key, value = entry.split('|')

      # An entry without a value is not a valid format URL; storing
      # it would let us hand a nil URL back to the caller.
      next if (value.nil? || value.empty?)

      url_map[key.to_i] = value
    end

    if (url_map.length < 1)
      raise StandardError.new("Could not find any valid format URLs.")
    end

    return url_map
  end


  # Returns the best format code from available_formats.
  def get_desired_format(available_formats)
    # Check for the presence of formats, in order of preference
    # (quality). That is, we check for the best formats first. As soon
    # as a format is found to be available, we return it as the
    # desired format, since the first format we find is going to be
    # the best available format.
    preferred = FORMAT_PREFERENCE.find { |fmt| available_formats.include?(fmt) }
    return preferred unless preferred.nil?

    # Available formats can't be empty (we would have raised an error
    # in get_format_url_map), so if there's some unknown format
    # here we might as well return it as a last resort.
    return available_formats[0]
  end

end