class Youtube < Website
# Pattern a URL has to match for this class to claim it (see owns_url?).
#
# Accepted shape, case-insensitively:
#   * an optional "http://" scheme,
#   * an optional single "label." prefix in front of "youtube.com/",
#   * one of "watch?v=", "v/" or a "<letters>#<l>/<l>/<digit>/" path,
#   * the video id ([a-z0-9_-]+),
#   * optionally "&..." trailing parameters and a final "#".
#
# The pattern is anchored with \A and \Z rather than ^ and $: the latter
# match at every line boundary, so a multi-line string that merely
# contained a YouTube URL on one of its lines used to be accepted. \Z
# (not \z) is used so a single trailing newline is still tolerated, as
# it was with $.
VALID_YOUTUBE_URL_REGEX = /\A(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?\Z/i
-
+
# Tells the caller whether +url+ is one this class knows how to handle.
#
# url - the candidate URL.
#
# Returns the Integer offset at which VALID_YOUTUBE_URL_REGEX matched
# (truthy in Ruby, even when it is 0), or nil when there is no match.
def self.owns_url?(url)
  url =~ VALID_YOUTUBE_URL_REGEX
end
# Set up a Youtube site object for +url+.
#
# The bare `super` forwards +url+ unchanged to the parent class's
# initializer. @format starts at 0 and is overwritten by get_video_url
# once a format has been chosen.
#
# NOTE(review): the lone "-" / "+" lines below are patch markers for a
# blank-line change, not Ruby code.
def initialize(url)
super
-
+
# The @format variable just caches the format of the video we're
# downloading. Storing it will prevent us from having to calculate
# it twice.
@format = 0
end
-
+
# Work out the direct download URL for this video.
#
# NOTE(review): this method is shown as a patch. Lines prefixed "-" are
# the superseded approach (build a get_video?video_id=...&t=... URL and
# append &fmt=); lines prefixed "+" are the replacement, which takes the
# URL straight from the page's fmt_url_map. Unprefixed lines are shared.
#
# Steps in the "+" version: fetch the watch page for the parsed video
# id, extract the format => URL map, let get_desired_format pick one of
# the map's keys, remember that pick in @format, and return the URL
# stored under it.
#
# Returns the value stored in the map for the chosen format.
# Raises StandardError (from get_format_url_map) when the map cannot be
# parsed or is empty.
def get_video_url()
video_id = self.parse_video_id()
page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
page_data = self.get_page_data(page_data_url)
- # Magic.
- t_parameter = self.parse_t_parameter(page_data)
-
- video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
+ # Get the URL map from the page.
+ fmt_url_map = get_format_url_map(page_data)
# Figure out which formats are available, and if any are,
# choose the best one.
- available_formats = get_available_formats(page_data)
+ available_formats = fmt_url_map.keys()
desired_format = get_desired_format(available_formats)
-
- if not desired_format.nil?
- # First we cache the format so that when we're asked for the
- # video filename later, we don't have to recompute the format.
- @format = desired_format
-
- # And then stick the format parameter on the end of the URL.
- video_url = video_url + "&fmt=#{desired_format}"
- end
-
+
+ # First we cache the format so that when we're asked for the
+ # video filename later, we don't have to recompute the format.
+ @format = desired_format
+
+ # And then use whatever URL is available for the desired format.
+ # We assume that all available formats will have an entry in the
+ # fmt_url_map hash.
+ video_url = fmt_url_map[desired_format]
+
return video_url
end
#
# The default extension is .flv.
extension = '.flv'
-
+
if [18, 22, 35, 37].include?(@format)
extension = '.mp4'
elsif (@format == 17)
extension = '.3gp'
end
-
+
return (self.parse_video_id() + extension)
end
-
+
protected;
# Get the video id from the URL. Should be relatively easy,
end
# NOTE(review): every line of parse_t_parameter below carries the patch's
# "-" marker, i.e. the method is deleted by this change. Its only visible
# caller, in get_video_url, is removed in the same patch.
- # Parse out the "t" parameter from the video's page. I'm not sure
- # what "t" stands for, but it's required for the final video URL to
- # work. It can be stored in either JSON or URL parameters.
- def parse_t_parameter(page_data)
- t_parameter = nil
-
- t_parameter_regexes = [ /\"t\"\:[[:space:]]\"([^\"]+?)\"/,
- /&t=([^&\"\\]+)/ ]
- matches = t_parameter_regexes.map { |tpr| tpr.match(page_data) }
-
- if matches.nitems == 0
- raise StandardError.new("Could not parse the 't' parameter.")
- end
- first_match = matches.compact[0]
- t_parameter = CGI::unescape(first_match[1])
-
- return t_parameter
- end
-
# Extract the format-id => video-URL map from the page's fmt_url_map
# variable.
#
# NOTE(review): shown as a patch. "+" lines are get_format_url_map; the
# interleaved "-" lines are the get_available_formats implementation it
# replaces (which parsed fmt_list instead). Unprefixed `end` / `return`
# lines are shared by both versions.
#
# page_data - text to search for "fmt_url_map=..." (presumably the watch
#             page body returned by get_page_data; confirm with caller).
#
# Returns a Hash whose keys are Integers (String#to_i of the text before
# the first "|" of each comma-separated entry) and whose values are the
# text between the first and second "|" (nil when an entry has no "|").
# Raises StandardError when the variable is not found, or when it yields
# no entries.
#
# NOTE(review): MatchData#length counts the whole match as well, so the
# `matches.length < 1` half of the guard looks unreachable; the nil check
# appears to be the one doing the work.
# NOTE(review): the value is CGI-unescaped before being split on "," and
# "|", so this relies on any such characters inside a URL still being
# escaped after one unescape pass. TODO confirm against real page data.
+ def get_format_url_map(page_data)
+ # Youtube has implemented a new fmt_url_map that (perhaps
+ # unsurprisingly) maps formats to video URLs. This makes it
+ # easyish to parse the video URLs.
+ url_map = {}
+ url_map_regex = /fmt_url_map=([^&\"]+)/
- def get_available_formats(page_data)
- # Parse the list of available formats from the "fmt_list"
- # variable. It can be stored as either a Flash variable (JSON
- # notation), or as URL parameter.
- available_formats = []
- fmt_list_regexes = [ /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/,
- /fmt_list=([^&\"\\]+)/ ]
+ matches = url_map_regex.match(page_data)
- matches = fmt_list_regexes.map { |flr| flr.match(page_data) }
+ if (matches.nil? || matches.length < 1)
+ raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
+ end
- if matches.nitems == 0
- raise StandardError.new("Could not find any valid formats.")
+ # The map is stored entirely in one Flash variable. The format is
+ # key|value,key|value,...
+ maptext = CGI::unescape(matches[1])
+ entries = maptext.split(',')
+ entries.each do |entry|
+ key = entry.split('|')[0].to_i
+ value = entry.split('|')[1]
+ url_map[key] = value
end
- first_match = matches.compact[0]
- fmts_string = CGI::unescape(first_match[1])
-
- fmts_string.split(',').each do |fmt|
- # Each "fmt" will look something like,
- #
- # 35/640000/9/0/115
- #
- # with the format identifier coming before the first slash.
- first_slash_idx = fmt.index('/')
- available_formats << fmt[0...first_slash_idx].to_i
+ if (url_map.length < 1)
+ raise StandardError.new("Could not find any valid format URLs.")
end
-
- return available_formats
+
+ return url_map
end
return 18 if available_formats.include?(18)
return 34 if available_formats.include?(34)
return 17 if available_formats.include?(17)
+
+ # Available formats can't be empty (we would have raised an error
+ # in get_format_url_map), so if there's some unknown format
+ # here we might as well return it as a last resort.
+ return available_formats[0]
end
-
+
end