#
# Copyright Michael Orlitzky
#
# http://michael.orlitzky.com/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# http://www.fsf.org/licensing/licenses/gpl.html
#

require 'src/website'
require 'cgi'


# Website handler for Youtube video pages. Given a /watch?v=<id> or
# /v/<id> URL, it computes the URL of the video file itself and a
# local filename (video id plus extension) to save it under.
class Youtube < Website

  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i

  # Format identifiers in order of preference (quality), best first.
  FORMAT_PREFERENCE = [37, 22, 35, 18, 34, 17]

  # The format -> extension mapping is available on Wikipedia:
  #
  #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
  #
  # Any format not listed in one of these gets DEFAULT_EXTENSION.
  MP4_FORMATS       = [18, 22, 35, 37]
  THREE_GP_FORMATS  = [17]
  DEFAULT_EXTENSION = '.flv'


  # Does the given URL look like a Youtube video URL?
  #
  # Returns the result of the regex match: the (truthy) match offset
  # when the URL matches VALID_YOUTUBE_URL_REGEX, and nil otherwise.
  def self.owns_url?(url)
    return url =~ VALID_YOUTUBE_URL_REGEX
  end


  def initialize(url)
    # Bare "super" forwards the url argument to the parent class.
    super

    # The @format variable just caches the format of the video we're
    # downloading. Storing it will prevent us from having to calculate
    # it twice. Zero means "no particular format chosen."
    @format = 0
  end


  # Build the URL from which the video file can be retrieved.
  #
  # As a side effect, @format is set to the chosen format whenever
  # one of the formats in FORMAT_PREFERENCE is advertised by the
  # video's page.
  def get_video_url()
    video_id = self.parse_video_id()

    # The video's URL (the "page data" URL) may be different from the
    # URL that was passed to the program. We support the /v/video_id
    # URL format, but that is *not* the main video page where we can
    # retrieve the "t" parameter. We can only get that from the
    # /watch?v=video_id form.
    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
    page_data = self.get_page_data(page_data_url)

    # Magic.
    t_parameter = self.parse_t_parameter(page_data)
    video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"

    # Figure out which formats are available, and if any are,
    # choose the best one.
    available_formats = get_available_formats(page_data)
    desired_format = get_desired_format(available_formats)

    unless desired_format.nil?
      # First we cache the format so that when we're asked for the
      # video filename later, we don't have to recompute the format.
      @format = desired_format

      # And then stick the format parameter on the end of the URL.
      video_url = video_url + "&fmt=#{desired_format}"
    end

    return video_url
  end


  # The local filename for the video: its id followed by an extension
  # determined by the cached @format (see the extension constants at
  # the top of the class).
  def get_video_filename()
    extension = DEFAULT_EXTENSION

    if MP4_FORMATS.include?(@format)
      extension = '.mp4'
    elsif THREE_GP_FORMATS.include?(@format)
      extension = '.3gp'
    end

    return (self.parse_video_id() + extension)
  end


  protected

  # Get the video id from the URL. Should be relatively easy,
  # unless Youtube supports some URL formats of which I'm unaware.
  #
  # Returns the id as a string, or nil if neither URL form matches.
  def parse_video_id()
    # Both URLs are fairly easy to parse if you handle
    # them one at a time. The only tricky situation is when
    # parameters like "&hl=en" are tacked on to the end.
    # We'll call /watch?v=video_id the "first form."
    first_form_video_id_regex = /v=([0-9a-z_\-]+)/i
    first_form_matches = first_form_video_id_regex.match(@url)
    return first_form_matches[1] unless first_form_matches.nil?

    # First form didn't work? Try the second.
    second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
    second_form_matches = second_form_video_id_regex.match(@url)
    return second_form_matches[1] unless second_form_matches.nil?

    # No matches at all.
    return nil
  end


  # Parse out the "t" parameter from the video's page. I'm not sure
  # what "t" stands for, but it's located in some JSON, and is required
  # for the final video URL to work.
  #
  # Returns the parameter's value, or nil if it isn't in page_data.
  def parse_t_parameter(page_data)
    t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/
    matches = t_parameter_regex.match(page_data)

    return nil if matches.nil?
    return matches[1]
  end


  # Parse the list of available formats from the "fmt_list" Flash
  # variable in page_data.
  #
  # Returns an array of integer format identifiers. The array is
  # empty (rather than nil) when the page has no "fmt_list", so the
  # result can always be handed to get_desired_format.
  def get_available_formats(page_data)
    fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/
    matches = fmt_list_regex.match(page_data)
    return [] if matches.nil?

    available_formats = []
    fmts_string = CGI::unescape(matches[1])

    fmts_string.split(',').each do |fmt|
      # Each "fmt" will look something like,
      #
      #   35/640000/9/0/115
      #
      # with the format identifier coming before the first slash.
      # Taking the first piece of a split (instead of slicing up to
      # the slash's index) also copes with an entry that contains no
      # slash at all.
      format_id = fmt.split('/').first

      # Skip empty entries; they carry no format identifier.
      next if format_id.nil? || format_id.empty?

      available_formats << format_id.to_i
    end

    return available_formats
  end


  # Choose the best format out of available_formats.
  #
  # Check for the presence of formats, in order of preference
  # (quality). That is, we check for the best formats first. As soon
  # as a format is found to be available, we return it as the
  # desired format, since the first format we find is going to be
  # the best available format.
  #
  # Returns nil when available_formats is nil or contains none of
  # the formats in FORMAT_PREFERENCE.
  def get_desired_format(available_formats)
    return nil if available_formats.nil?

    return FORMAT_PREFERENCE.find { |fmt| available_formats.include?(fmt) }
  end

end