#
require 'src/website'
-
+require 'cgi'
class Youtube < Website
- VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i
-
+ # Pattern used by owns_url? to decide whether a URL belongs to this
+ # class. It accepts an optional "http://" prefix and an optional
+ # subdomain, followed by "youtube.com/" and one of three path forms:
+ # "watch?v=", "v/", or a "letters#x/x/N/" fragment form (one letter,
+ # one letter, one digit), and then the video id itself. Matching is
+ # case-insensitive.
+ #
+ # NOTE(review): in Ruby, ^ and $ match at line boundaries rather than
+ # at the start/end of the whole string; \A and \z anchor the entire
+ # input. Confirm whether multi-line input can ever reach this check.
+ VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i
+
+ # Report whether the given URL matches VALID_YOUTUBE_URL_REGEX.
+ # The value returned is the result of =~, i.e. the offset of the
+ # match (truthy) on success and nil otherwise, not a strict
+ # true/false.
def self.owns_url?(url)
return url =~ VALID_YOUTUBE_URL_REGEX
end
-
+
+ # Set up the instance for the given URL. The bare super call
+ # forwards the URL unchanged to the superclass constructor.
+ def initialize(url)
+ super
+
+ # The @format variable caches the format of the video we're
+ # downloading. Storing it prevents us from having to calculate
+ # it twice. It starts at 0 and is only given a real value by
+ # get_video_url(); until then get_video_filename() falls back to
+ # its default extension.
+ @format = 0
+ end
+
+
+ # Work out the direct download URL for the video. The watch page
+ # data is obtained from get_page_data (defined outside this diff),
+ # its fmt_url_map is parsed, and the URL of the preferred available
+ # format is returned. As a side effect the chosen format is stored
+ # in @format. Errors raised by parse_video_id or get_format_url_map
+ # are not rescued here.
def get_video_url()
video_id = self.parse_video_id()
page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
page_data = self.get_page_data(page_data_url)
- # Magic.
- t_parameter = self.parse_t_parameter(page_data)
-
- video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
-
+ # Get the URL map from the page.
+ fmt_url_map = get_format_url_map(page_data)
+
+ # Figure out which formats are available, and if any are,
+ # choose the best one.
+ available_formats = fmt_url_map.keys()
+ desired_format = get_desired_format(available_formats)
+
+ # First we cache the format so that when we're asked for the
+ # video filename later, we don't have to recompute the format.
+ @format = desired_format
+
+ # And then use whatever URL is available for the desired format.
+ # We assume that all available formats will have an entry in the
+ # fmt_url_map hash.
+ video_url = fmt_url_map[desired_format]
+
return video_url
end
+ # Build the local filename for the video: the parsed video id plus
+ # an extension chosen from @format. Because @format is only set by
+ # get_video_url(), calling this method first yields the default
+ # '.flv' extension.
def get_video_filename()
- return (self.parse_video_id() + '.flv')
+ # The format -> extension mapping is available on Wikipedia:
+ #
+ # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
+ #
+ # The default extension is .flv.
+ extension = '.flv'
+
+ if [18, 22, 35, 37].include?(@format)
+ extension = '.mp4'
+ elsif (@format == 17)
+ extension = '.3gp'
+ end
+
+ return (self.parse_video_id() + extension)
end
-
+
protected;
# Get the video id from the URL. Should be relatively easy,
# unless Youtube supports some URL formats of which I'm unaware.
+ # Returns the id captured from @url by the first pattern that
+ # matches. The patterns visible in this diff are the "/v/<id>" form
+ # and a trailing "/<id>" path segment. Raises StandardError when no
+ # pattern matches.
def parse_video_id()
- # Return nil if we get no matches below.
- video_id = nil
-
# Both URLs are fairly easy to parse if you handle
# them one at a time. The only tricky situation is when
# parameters like "&hl=en" are tacked on to the end.
# First form didn't work? Try the second.
second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
second_form_matches = second_form_video_id_regex.match(@url)
- video_id = second_form_matches[1] if not (second_form_matches.nil? ||
- second_form_matches.length < 2)
-
- return video_id
+ return second_form_matches[1] if not (second_form_matches.nil? ||
+ second_form_matches.length < 2)
+
+ # ...and the third.
+ # NOTE(review): this pattern only accepts alphanumeric ids, while
+ # VALID_YOUTUBE_URL_REGEX also allows "_" and "-" in the id, so a
+ # third-form URL whose id contains either appears to fall through
+ # to the error below -- confirm whether that is intended.
+ third_form_video_id_regex = /\/([[:alnum:]]+)$/i
+ third_form_matches = third_form_video_id_regex.match(@url)
+ return third_form_matches[1] if not (third_form_matches.nil? ||
+ third_form_matches.length < 2)
+
+ # If we made it here, we couldn't figure out the video id. Yes,
+ # this is fatal, since we don't know where the video file is
+ # located.
+ raise StandardError.new("Could not parse the video id.")
end
- # Parse out the "t" parameter from the video's page. I'm not sure
- # what "t" stands for, but it's located in some JSON, and is required
- # for the final video URL to work.
- def parse_t_parameter(page_data)
- t_parameter = nil
-
- t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/
- matches = t_parameter_regex.match(page_data)
- t_parameter = matches[1] if not (matches.nil? || matches.length < 2)
- return t_parameter
+ # Extract the fmt_url_map Flash variable from the page data and
+ # return it as a Hash mapping integer format codes to video URLs.
+ # Raises StandardError if the variable cannot be found, or if
+ # parsing it produces no entries.
+ def get_format_url_map(page_data)
+ # Youtube has implemented a new fmt_url_map that (perhaps
+ # unsurprisingly) maps formats to video URLs. This makes it
+ # easyish to parse the video URLs.
+ url_map = {}
+ url_map_regex = /fmt_url_map=([^&\"]+)/
+
+ matches = url_map_regex.match(page_data)
+
+ # NOTE(review): a non-nil MatchData always has a length of at
+ # least 1 (the whole match counts as an element), so only the
+ # nil? test can actually trigger this error.
+ if (matches.nil? || matches.length < 1)
+ raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
+ end
+
+ # The map is stored entirely in one Flash variable. The format is
+ # key|value,key|value,...
+ # The captured text is URL-decoded before it is split.
+ # NOTE(review): to_i turns a non-numeric key into 0, and an entry
+ # without a '|' is stored with a nil URL; neither case is rejected
+ # here.
+ maptext = CGI::unescape(matches[1])
+ entries = maptext.split(',')
+ entries.each do |entry|
+ key = entry.split('|')[0].to_i
+ value = entry.split('|')[1]
+ url_map[key] = value
+ end
+
+ if (url_map.length < 1)
+ raise StandardError.new("Could not find any valid format URLs.")
+ end
+
+ return url_map
+ end
+
+
+ # Pick the format to download from the list of available format
+ # codes. Returns the first of 37, 22, 35, 18, 34, 17 that is
+ # present; if none of them is, returns the first element of
+ # available_formats.
+ def get_desired_format(available_formats)
+ # Check for the presence of formats, in order of preference
+ # (quality). That is, we check for the best formats first. As soon
+ # as a format is found to be available, we return it as the
+ # desired format, since the first format we find is going to be
+ # the best available format.
+ return 37 if available_formats.include?(37)
+ return 22 if available_formats.include?(22)
+ return 35 if available_formats.include?(35)
+ return 18 if available_formats.include?(18)
+ return 34 if available_formats.include?(34)
+ return 17 if available_formats.include?(17)
+
+ # When called from get_video_url, available formats can't be empty
+ # (get_format_url_map would have raised an error), so if there's
+ # some unknown format here we might as well return it as a last
+ # resort.
+ return available_formats[0]
end
-
-
+
end