page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
page_data = self.get_page_data(page_data_url)
- # Get the URL map from the page.
- fmt_url_map = get_format_url_map(page_data)
-
- # Figure out which formats are available, and if any are,
- # choose the best one.
- available_formats = fmt_url_map.keys()
- desired_format = get_desired_format(available_formats)
-
- # First we cache the format so that when we're asked for the
- # video filename later, we don't have to recompute the format.
- @format = desired_format
-
- # And then use whatever URL is available for the desired format.
- # We assume that all available formats will have an entry in the
- # fmt_url_map hash.
- video_url = fmt_url_map[desired_format]
+ begin
+ # Get the URL map from the page.
+ fmt_url_map = get_format_url_map(page_data)
+
+ # Figure out which formats are available, and if any are,
+ # choose the best one.
+ available_formats = fmt_url_map.keys()
+ desired_format = get_desired_format(available_formats)
+
+ # First we cache the format so that when we're asked for the
+ # video filename later, we don't have to recompute the format.
+ @format = desired_format
+
+ # And then use whatever URL is available for the desired format.
+ # We assume that all available formats will have an entry in the
+ # fmt_url_map hash.
+ video_url = fmt_url_map[desired_format]
+ return video_url
+ rescue StandardError => e
+ # If at first you do not succeed, maybe someone decided to
+ # change some shit. This alternate method parses
+ # url_encoded_fmt_stream_map.
+ fmt_streams = get_fmt_stream_list(page_data)
+ video_url = self.choose_best_fmt_stream_url(fmt_streams)
+
+ # A duplicated "itag" parameter results in a 403.
+ itag_regex = /&itag=\d+/
+ matches = video_url.scan(itag_regex)
+
+ if matches.length > 1
+ # Get rid of the first occurrence.
+ video_url.sub!(itag_regex, '')
+ end
+ end
return video_url
end
protected;
+ def choose_best_fmt_stream_url(fmt_stream_urls)
+ # Take a list, generated by get_fmt_stream_list(), and choose the
+ # best URL out of the bunch based on the video format.
+ fmt_stream_urls.each do |fs|
+ if fs =~ /video\/mp4/ and fs =~ /quality=large/
+ return fs
+ elsif fs =~ /quality=large/
+ return fs
+ elsif fs =~ /video\/mp4/
+ return fs
+ else
+ return fs
+ end
+ end
+ end
+
+
+ def unicode_unescape(string)
+ # Unescape sequences like '\u0026'.
+ # Ok, only '\u0026' for now.
+ return string.gsub('\u0026', '&')
+ end
+
+
+ def get_fmt_stream_list(page_data)
+ # This is another (new?) method of embedding the video URLs.
+ # The url_encoded_fmt_stream_map variable contains a list of URLs
+ # in the form url=foo1,url=foo2...
+ #
+ # It looks like the first one in the list is the highest
+ # quality? Let's just take that one for now.
+ fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/
+
+ matches = fmt_stream_regex.match(page_data)
+
+ if (matches.nil? || matches.length < 2)
+ raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
+ end
+
+ urlstring = matches[1]
+ urlstring.gsub!('url=', '')
+ urls = urlstring.split(',')
+
+ urls.each_index do |idx|
+ urls[idx] = self.unicode_unescape(urls[idx])
+ urls[idx] = CGI::unescape(urls[idx])
+ # Strip off everything after the first space in the URL.
+ # I don't know why this works, but if we leave the space
+ # in (encoded, even), Youtube throws us 403 errors.
+ urls[idx].gsub!(/ .+$/, '')
+ end
+
+ return urls
+ end
+
+
# Get the video id from the URL. Should be relatively easy,
# unless Youtube supports some URL formats of which I'm unaware.
def parse_video_id()