#
require 'src/website'
-
-# Needed to download the page, which is in turn
-# needed because it contains the video URL.
-require 'net/http'
-require 'uri'
-
+require 'cgi'
class Youtube < Website
- VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[[:alnum:]]+(\&.*)?\#?$/
+ VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i
+ # Tells the caller whether this class handles the given URL. Returns the
+ # result of matching url against VALID_YOUTUBE_URL_REGEX: the Integer
+ # offset of the match (truthy in Ruby, even when it is 0) on success,
+ # or nil when the URL does not match.
def self.owns_url?(url)
return url =~ VALID_YOUTUBE_URL_REGEX
end
+
+ def initialize(url)
+   # Pass the URL up to Website#initialize.
+   super(url)
+
+   # Format code of the video being downloaded. get_video_url() stores the
+   # chosen format here so that get_video_filename() can pick the matching
+   # extension without working the format out a second time. It stays 0
+   # until a format has been chosen, in which case get_video_filename()
+   # uses its default extension.
+   @format = 0
+ end
+
- def get_video_url(url)
- video_id = self.parse_video_id(url)
+ def get_video_url()
+ video_id = self.parse_video_id()
# The video's URL (the "page data" URL) may be different from the
# URL that was passed to the program. We support the /v/video_id
t_parameter = self.parse_t_parameter(page_data)
video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
-
+
+ # Figure out which formats are available, and if any are,
+ # choose the best one.
+ available_formats = get_available_formats(page_data)
+ desired_format = get_desired_format(available_formats)
+
+ if not desired_format.nil?
+ # First we cache the format so that when we're asked for the
+ # video filename later, we don't have to recompute the format.
+ @format = desired_format
+
+ # And then stick the format parameter on the end of the URL.
+ video_url = video_url + "&fmt=#{desired_format}"
+ end
+
return video_url
end
+
+ def get_video_filename()
+   # Builds the local filename for the video: the video id followed by
+   # the extension that matches the format cached in @format.
+   #
+   # The format -> extension mapping is available on Wikipedia:
+   #
+   #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
+   #
+   # Formats 18, 22 and 37 are MP4 and 17 is 3GP. Everything else gets
+   # the default .flv extension; that includes 34 and 35, which carry
+   # H.264 video inside an FLV container, and 0, which means no format
+   # was chosen.
+   extension = case @format
+               when 18, 22, 37 then '.mp4'
+               when 17 then '.3gp'
+               else '.flv'
+               end
+
+   return (self.parse_video_id() + extension)
+ end
+
protected;
# Get the video id from the URL. Should be relatively easy,
# unless Youtube supports some URL formats of which I'm unaware.
- def parse_video_id(url)
+ def parse_video_id()
# Return nil if we get no matches below.
video_id = nil
# them one at a time. The only tricky situation is when
# parameters like "&hl=en" are tacked on to the end.
# We'll call /watch?v=video_id the "first form."
- first_form_video_id_regex = /v=([[:alnum:]]+)$/
- first_form_matches = first_form_video_id_regex.match(url)
+ first_form_video_id_regex = /v=([0-9a-z_\-]+)/i
+ first_form_matches = first_form_video_id_regex.match(@url)
return first_form_matches[1] if not (first_form_matches.nil? ||
first_form_matches.length < 2)
# First form didn't work? Try the second.
- second_form_video_id_regex = /\/v\/([[:alnum:]]+)/
- second_form_matches = second_form_video_id_regex.match(url)
+ second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
+ second_form_matches = second_form_video_id_regex.match(@url)
video_id = second_form_matches[1] if not (second_form_matches.nil? ||
second_form_matches.length < 2)
return t_parameter
end
-
- def get_page_data(url)
- uri = URI.parse(url)
- response = Net::HTTP.start(uri.host, uri.port) do |http|
- http.get(uri.request_uri)
+ # Extracts the format identifiers advertised in page_data.
+ #
+ # Returns an Array of Integers, one per entry of the "fmt_list" value,
+ # in the order they appear. Returns nil (not an empty Array) when no
+ # "fmt_list" value is found in page_data, so callers have to cope with
+ # nil.
+ def get_available_formats(page_data)
+ # Parse the list of available formats from the "fmt_list" Flash
+ # variable.
+ available_formats = []
+ # Matches "fmt_list": "<value>" with exactly one whitespace character
+ # between the colon and the opening quote; the capture is the value.
+ fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/
+ matches = fmt_list_regex.match(page_data)
+
+ if matches.nil?
+ return nil
+ else
+ # The captured value is run through CGI::unescape before being split.
+ fmts_string = CGI::unescape(matches[1])
+
+ fmts_string.split(',').each do |fmt|
+ # Each "fmt" will look something like,
+ #
+ # 35/640000/9/0/115
+ #
+ # with the format identifier coming before the first slash.
+ #
+ # NOTE(review): index returns nil for an entry without a slash;
+ # confirm that every entry contains one.
+ first_slash_idx = fmt.index('/')
+ available_formats << fmt[0...first_slash_idx].to_i
+ end
+
end
+
+ return available_formats
+ end
+
- return response.body
+ def get_desired_format(available_formats)
+   # Picks the best format out of available_formats.
+   #
+   # available_formats is a list of Integer format identifiers. It may
+   # also be nil, which is what get_available_formats returns when the
+   # page carries no "fmt_list"; nil is treated like an empty list.
+   #
+   # Returns the preferred format identifier, or nil when none of the
+   # known formats is available.
+   candidates = Array(available_formats)
+
+   # Known formats in order of preference (quality), best first. The
+   # first one found among the candidates is therefore the best
+   # available format.
+   return [37, 22, 35, 18, 34, 17].find { |fmt| candidates.include?(fmt) }
end
end