X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2Fwebsites%2Fyoutube.rb;h=532d8cae4653560fec7341d91d2328ca47027e12;hb=14b76291067609e454d5c18e9ce8bc2a0d09987e;hp=6766af2d654e4438b7a5dadc07ee61f35b4fd2ff;hpb=af614c64b3d5998471af5e54b3d8f36d3e00cc63;p=dead%2Fwhatever-dl.git diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb index 6766af2..532d8ca 100644 --- a/src/websites/youtube.rb +++ b/src/websites/youtube.rb @@ -17,24 +17,29 @@ # require 'src/website' - -# Needed to download the page, which is in turn -# needed because it contains the video URL. -require 'net/http' -require 'uri' - +require 'cgi' class Youtube < Website - VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[[:alnum:]]+(\&.*)?$/ + VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i def self.owns_url?(url) return url =~ VALID_YOUTUBE_URL_REGEX end + + def initialize(url) + super + + # The @format variable just caches the format of the video we're + # downloading. Storing it will prevent us from having to calculate + # it twice. + @format = 0 + end + - def get_video_url(url) - video_id = self.parse_video_id(url) + def get_video_url() + video_id = self.parse_video_id() # The video's URL (the "page data" URL) may be different from the # URL that was passed to the program. We support the /v/video_id @@ -48,16 +53,48 @@ class Youtube < Website t_parameter = self.parse_t_parameter(page_data) video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}" - + + # Figure out which formats are available, and if any are, + # choose the best one. + available_formats = get_available_formats(page_data) + desired_format = get_desired_format(available_formats) + + if not desired_format.nil? + # First we cache the format so that when we're asked for the + # video filename later, we don't have to recompute the format. + @format = desired_format + + # And then stick the format parameter on the end of the URL. + video_url = video_url + "&fmt=#{desired_format}" + end + return video_url end + + def get_video_filename() + # The format -> extension mapping is available on Wikipedia: + # + # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs + # + # The default extension is .flv. + extension = '.flv' + + if [18, 22, 35, 37].include?(@format) + extension = '.mp4' + elsif (@format == 17) + extension = '.3gp' + end + + return (self.parse_video_id() + extension) + end + protected; # Get the video id from the URL. Should be relatively easy, # unless Youtube supports some URL formats of which I'm unaware. - def parse_video_id(url) + def parse_video_id() # Return nil if we get no matches below. video_id = nil @@ -65,14 +102,14 @@ class Youtube < Website # them one at a time. The only tricky situation is when # parameters like "&hl=en" are tacked on to the end. # We'll call /watch?v=video_id the "first form." - first_form_video_id_regex = /v=([[:alnum:]]+)$/ - first_form_matches = first_form_video_id_regex.match(url) + first_form_video_id_regex = /v=([0-9a-z_\-]+)/i + first_form_matches = first_form_video_id_regex.match(@url) return first_form_matches[1] if not (first_form_matches.nil? || first_form_matches.length < 2) # First form didn't work? Try the second. - second_form_video_id_regex = /\/v\/([[:alnum:]]+)/ - second_form_matches = second_form_video_id_regex.match(url) + second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i + second_form_matches = second_form_video_id_regex.match(@url) video_id = second_form_matches[1] if not (second_form_matches.nil? || second_form_matches.length < 2) @@ -86,22 +123,54 @@ class Youtube < Website def parse_t_parameter(page_data) t_parameter = nil - t_parameter_regex = /\"t\"\:[[:space:]]\"([[:alnum:]]+)\"/ + t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/ matches = t_parameter_regex.match(page_data) t_parameter = matches[1] if not (matches.nil? || matches.length < 2) return t_parameter end - - def get_page_data(url) - uri = URI.parse(url) - response = Net::HTTP.start(uri.host, uri.port) do |http| - http.get(uri.request_uri) + def get_available_formats(page_data) + # Parse the list of available formats from the "fmt_list" Flash + # variable. + available_formats = [] + fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/ + matches = fmt_list_regex.match(page_data) + + if matches.nil? + return nil + else + fmts_string = CGI::unescape(matches[1]) + + fmts_string.split(',').each do |fmt| + # Each "fmt" will look something like, + # + # 35/640000/9/0/115 + # + # with the format identifier coming before the first slash. + first_slash_idx = fmt.index('/') + available_formats << fmt[0...first_slash_idx].to_i + end + end + + return available_formats + end + - return response.body + def get_desired_format(available_formats) + # Check for the presence of formats, in order of preference + # (quality). That is, we check for the best formats first. As soon + # as a format is found to be available, we return it as the + # desired format, since the first format we find is going to be + # the best available format. + return 37 if available_formats.include?(37) + return 22 if available_formats.include?(22) + return 35 if available_formats.include?(35) + return 18 if available_formats.include?(18) + return 34 if available_formats.include?(34) + return 17 if available_formats.include?(17) end end