X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2Fwebsites%2Fyoutube.rb;h=5f87754f9de85e69a3124f22c192f7df57e20a93;hb=a557fae384ed2c8f8782ed09e5428e2cf701acf6;hp=c8a1cad008fba578efd301ef53c7ee4b5bd44c6f;hpb=e756e0b650774f2503702512ccdc02e86eee1788;p=dead%2Fwhatever-dl.git diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index c8a1cad..5f87754 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -17,16 +17,26 @@ # require 'src/website' - +require 'cgi' class Youtube < Website - VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i + VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i def self.owns_url?(url) return url =~ VALID_YOUTUBE_URL_REGEX end + + def initialize(url) + super + + # The @format variable just caches the format of the video we're + # downloading. Storing it will prevent us from having to calculate + # it twice. + @format = 0 + end + def get_video_url() video_id = self.parse_video_id() @@ -43,13 +53,40 @@ class Youtube < Website t_parameter = self.parse_t_parameter(page_data) video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}" - + + # Figure out which formats are available, and if any are, + # choose the best one. + available_formats = get_available_formats(page_data) + desired_format = get_desired_format(available_formats) + + if not desired_format.nil? + # First we cache the format so that when we're asked for the + # video filename later, we don't have to recompute the format. + @format = desired_format + + # And then stick the format parameter on the end of the URL. + video_url = video_url + "&fmt=#{desired_format}" + end + return video_url end def get_video_filename() - return (self.parse_video_id() + '.flv') + # The format -> extension mapping is available on Wikipedia: + # + # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs + # + # The default extension is .flv. + extension = '.flv' + + if [18, 22, 35, 37].include?(@format) + extension = '.mp4' + elsif (@format == 17) + extension = '.3gp' + end + + return (self.parse_video_id() + extension) end @@ -58,9 +95,6 @@ class Youtube < Website # Get the video id from the URL. Should be relatively easy, # unless Youtube supports some URL formats of which I'm unaware. def parse_video_id() - # Return nil if we get no matches below. - video_id = nil - # Both URLs are fairly easy to parse if you handle # them one at a time. The only tricky situation is when # parameters like "&hl=en" are tacked on to the end. @@ -73,10 +107,19 @@ class Youtube < Website # First form didn't work? Try the second. second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i second_form_matches = second_form_video_id_regex.match(@url) - video_id = second_form_matches[1] if not (second_form_matches.nil? || - second_form_matches.length < 2) - - return video_id + return second_form_matches[1] if not (second_form_matches.nil? || + second_form_matches.length < 2) + + # ...and the third. + third_form_video_id_regex = /\/([[:alnum:]]+)$/i + third_form_matches = third_form_video_id_regex.match(@url) + return third_form_matches[1] if not (third_form_matches.nil? || + third_form_matches.length < 2) + + # If we made it here, we couldn't figure out the video id. Yes, + # this is fatal, since we don't know where the video file is + # located. + raise StandardError.new("Could not parse the video id.") end @@ -93,5 +136,45 @@ class Youtube < Website return t_parameter end + + def get_available_formats(page_data) + # Parse the list of available formats from the "fmt_list" Flash + # variable. + available_formats = [] + fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/ + matches = fmt_list_regex.match(page_data) + + if matches.nil? + raise StandardError.new("Could not find any valid formats.") + end + + fmts_string = CGI::unescape(matches[1]) + fmts_string.split(',').each do |fmt| + # Each "fmt" will look something like, + # + # 35/640000/9/0/115 + # + # with the format identifier coming before the first slash. + first_slash_idx = fmt.index('/') + available_formats << fmt[0...first_slash_idx].to_i + end + + return available_formats + end + + + def get_desired_format(available_formats) + # Check for the presence of formats, in order of preference + # (quality). That is, we check for the best formats first. As soon + # as a format is found to be available, we return it as the + # desired format, since the first format we find is going to be + # the best available format. + return 37 if available_formats.include?(37) + return 22 if available_formats.include?(22) + return 35 if available_formats.include?(35) + return 18 if available_formats.include?(18) + return 34 if available_formats.include?(34) + return 17 if available_formats.include?(17) + end end