X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2Fwebsites%2Fyoutube.rb;h=5f87754f9de85e69a3124f22c192f7df57e20a93;hb=a557fae384ed2c8f8782ed09e5428e2cf701acf6;hp=0d9bf398338538a3dfd8e128e851a98680ca46ae;hpb=2c835ed7a247ed5639277bc9674b848722ad998d;p=dead%2Fwhatever-dl.git diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index 0d9bf39..5f87754 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -17,24 +17,29 @@ # require 'src/website' - -# Needed to download the page, which is in turn -# needed because it contains the video URL. -require 'net/http' -require 'uri' - +require 'cgi' class Youtube < Website - VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[[:alnum:]]+(\&.*)?\#?$/ + VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i def self.owns_url?(url) return url =~ VALID_YOUTUBE_URL_REGEX end + + def initialize(url) + super + + # The @format variable just caches the format of the video we're + # downloading. Storing it will prevent us from having to calculate + # it twice. + @format = 0 + end + - def get_video_url(url) - video_id = self.parse_video_id(url) + def get_video_url() + video_id = self.parse_video_id() # The video's URL (the "page data" URL) may be different from the # URL that was passed to the program. We support the /v/video_id @@ -48,35 +53,73 @@ class Youtube < Website t_parameter = self.parse_t_parameter(page_data) video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}" - + + # Figure out which formats are available, and if any are, + # choose the best one. + available_formats = get_available_formats(page_data) + desired_format = get_desired_format(available_formats) + + if not desired_format.nil? + # First we cache the format so that when we're asked for the + # video filename later, we don't have to recompute the format. + @format = desired_format + + # And then stick the format parameter on the end of the URL. + video_url = video_url + "&fmt=#{desired_format}" + end + return video_url end + + def get_video_filename() + # The format -> extension mapping is available on Wikipedia: + # + # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs + # + # The default extension is .flv. + extension = '.flv' + + if [18, 22, 35, 37].include?(@format) + extension = '.mp4' + elsif (@format == 17) + extension = '.3gp' + end + + return (self.parse_video_id() + extension) + end + protected; # Get the video id from the URL. Should be relatively easy, # unless Youtube supports some URL formats of which I'm unaware. - def parse_video_id(url) - # Return nil if we get no matches below. - video_id = nil - + def parse_video_id() # Both URLs are fairly easy to parse if you handle # them one at a time. The only tricky situation is when # parameters like "&hl=en" are tacked on to the end. # We'll call /watch?v=video_id the "first form." - first_form_video_id_regex = /v=([[:alnum:]]+)$/ - first_form_matches = first_form_video_id_regex.match(url) + first_form_video_id_regex = /v=([0-9a-z_\-]+)/i + first_form_matches = first_form_video_id_regex.match(@url) return first_form_matches[1] if not (first_form_matches.nil? || first_form_matches.length < 2) # First form didn't work? Try the second. - second_form_video_id_regex = /\/v\/([[:alnum:]]+)/ - second_form_matches = second_form_video_id_regex.match(url) - video_id = second_form_matches[1] if not (second_form_matches.nil? || - second_form_matches.length < 2) - - return video_id + second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i + second_form_matches = second_form_video_id_regex.match(@url) + return second_form_matches[1] if not (second_form_matches.nil? || + second_form_matches.length < 2) + + # ...and the third. + third_form_video_id_regex = /\/([[:alnum:]]+)$/i + third_form_matches = third_form_video_id_regex.match(@url) + return third_form_matches[1] if not (third_form_matches.nil? || + third_form_matches.length < 2) + + # If we made it here, we couldn't figure out the video id. Yes, + # this is fatal, since we don't know where the video file is + # located. + raise StandardError.new("Could not parse the video id.") end @@ -92,16 +135,46 @@ class Youtube < Website return t_parameter end - - def get_page_data(url) - uri = URI.parse(url) - response = Net::HTTP.start(uri.host, uri.port) do |http| - http.get(uri.request_uri) + def get_available_formats(page_data) + # Parse the list of available formats from the "fmt_list" Flash + # variable. + available_formats = [] + fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/ + matches = fmt_list_regex.match(page_data) + + if matches.nil? + raise StandardError.new("Could not find any valid formats.") + end + + fmts_string = CGI::unescape(matches[1]) + fmts_string.split(',').each do |fmt| + # Each "fmt" will look something like, + # + # 35/640000/9/0/115 + # + # with the format identifier coming before the first slash. + first_slash_idx = fmt.index('/') + available_formats << fmt[0...first_slash_idx].to_i end + + return available_formats + end + - return response.body + def get_desired_format(available_formats) + # Check for the presence of formats, in order of preference + # (quality). That is, we check for the best formats first. As soon + # as a format is found to be available, we return it as the + # desired format, since the first format we find is going to be + # the best available format. + return 37 if available_formats.include?(37) + return 22 if available_formats.include?(22) + return 35 if available_formats.include?(35) + return 18 if available_formats.include?(18) + return 34 if available_formats.include?(34) + return 17 if available_formats.include?(17) end end