--- /dev/null
+#
+# Copyright Michael Orlitzky
+#
+# http://michael.orlitzky.com/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# http://www.fsf.org/licensing/licenses/gpl.html
+#
+
+require 'src/website'
+require 'cgi'
+
+class Youtube < Website
+
+ # Matches the YouTube URL forms this class accepts: /watch?v=ID, /v/ID
+ # and the "name#x/y/0/ID" form, each with an optional http:// or
+ # https:// scheme, an optional subdomain, optional trailing "&..."
+ # parameters and an optional trailing "#".
+ #
+ # The pattern is anchored with \A and \z rather than ^ and $. The
+ # latter match at every line boundary, so a multi-line string that
+ # merely contains a YouTube URL on one of its lines would be accepted.
+ VALID_YOUTUBE_URL_REGEX = /\A(https?:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?\z/i
+
+ # Decide whether or not this class is responsible for +url+.
+ #
+ # Returns a truthy value (the match offset, always 0 because the
+ # pattern is anchored at the start) when +url+ is a supported YouTube
+ # URL, and nil otherwise.
+ def self.owns_url?(url)
+   return url =~ VALID_YOUTUBE_URL_REGEX
+ end
+
+
+ # Build a Youtube site object for +url+. The URL is handed to the
+ # Website initializer unchanged.
+ def initialize(url)
+   super(url)
+
+   # Cache for the numeric format of the video being downloaded, so
+   # that it only needs to be worked out once. It stays at zero until
+   # get_video_url() has chosen a format.
+   @format = 0
+ end
+
+
+ # Find the URL of the actual video file for the page in @url.
+ #
+ # The fmt_url_map variable is tried first. If anything goes wrong
+ # while parsing it, the url_encoded_fmt_stream_map variable is used
+ # instead. Either way the chosen format is cached in @format so that
+ # get_video_filename() can pick a matching extension.
+ #
+ # Returns the video URL as a string. Errors raised by the fallback
+ # parser (or by parse_video_id / get_page_data) propagate to the
+ # caller.
+ def get_video_url()
+   video_id = self.parse_video_id()
+
+   # The video's URL (the "page data" URL) may be different from the
+   # URL that was passed to the program. We support the /v/video_id
+   # URL format, but that is *not* the main video page where we can
+   # retrieve the "t" parameter. We can only get that from the
+   # /watch?v=video_id form.
+   page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
+   page_data = self.get_page_data(page_data_url)
+
+   begin
+     # Get the URL map from the page, pick the best of the formats it
+     # offers, and remember that choice for get_video_filename().
+     fmt_url_map = get_format_url_map(page_data)
+     @format = get_desired_format(fmt_url_map.keys())
+
+     # We assume that all available formats will have an entry in the
+     # fmt_url_map hash.
+     video_url = fmt_url_map[@format]
+   rescue StandardError
+     # If at first you do not succeed, maybe someone decided to
+     # change things. This alternate method parses
+     # url_encoded_fmt_stream_map.
+     fmt_streams = get_fmt_stream_list(page_data)
+     video_url = self.choose_best_fmt_stream_url(fmt_streams)
+
+     # A duplicated "itag" parameter results in a 403, so get rid of
+     # the first occurrence when there is more than one.
+     itag_regex = /&itag=\d+/
+     if video_url.scan(itag_regex).length > 1
+       video_url.sub!(itag_regex, '')
+     end
+
+     # Previously this path left @format at its initial value, so the
+     # filename always got the default extension. Cache the format
+     # here too.
+     # NOTE(review): this assumes the "itag" value is the same numeric
+     # format code that keys fmt_url_map -- confirm.
+     itag = video_url[/[?&]itag=(\d+)/, 1]
+     @format = itag.to_i unless itag.nil?
+   end
+
+   return video_url
+ end
+
+
+ # Build the local filename for the video: its id followed by an
+ # extension derived from the cached @format.
+ #
+ # The format -> extension mapping is available on Wikipedia:
+ #
+ # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
+ def get_video_filename()
+   case @format
+   when 18, 22, 35, 37
+     suffix = '.mp4'
+   when 17
+     suffix = '.3gp'
+   else
+     # Everything else, including the initial @format of zero, gets
+     # the default extension.
+     suffix = '.flv'
+   end
+
+   video_id = self.parse_video_id()
+   return (video_id + suffix)
+ end
+
+
+ protected;
+
+ # Take a list, generated by get_fmt_stream_list(), and choose the
+ # best URL out of the bunch based on the video format.
+ #
+ # Preference order:
+ #
+ #   1. the first URL mentioning both "video/mp4" and "quality=large",
+ #   2. the first URL mentioning "quality=large",
+ #   3. the first URL mentioning "video/mp4",
+ #   4. the first URL in the list.
+ #
+ # This used to return from inside the loop on the very first element
+ # no matter which branch was taken, so the preferences above were
+ # never applied, and an empty list came back as the list itself.
+ #
+ # Returns the chosen URL string. Raises StandardError if the list is
+ # nil or empty.
+ def choose_best_fmt_stream_url(fmt_stream_urls)
+   if fmt_stream_urls.nil? || fmt_stream_urls.empty?
+     raise StandardError.new("No format stream URLs were available.")
+   end
+
+   mp4_regex = /video\/mp4/
+   large_regex = /quality=large/
+
+   best = fmt_stream_urls.find { |fs| fs =~ mp4_regex && fs =~ large_regex }
+   best ||= fmt_stream_urls.find { |fs| fs =~ large_regex }
+   best ||= fmt_stream_urls.find { |fs| fs =~ mp4_regex }
+   best ||= fmt_stream_urls.first
+
+   return best
+ end
+
+
+ # Unescape sequences like '\u0026' (a literal backslash, a "u" and
+ # four hex digits) into the characters they stand for.
+ #
+ # Only escapes in the ASCII range (\u0000 through \u007f) are
+ # decoded. ASCII replacements can be spliced into a string of any
+ # ASCII-compatible encoding, which is not true of arbitrary code
+ # points. Anything else is left exactly as it was.
+ #
+ # Returns a new string; the argument is not modified.
+ def unicode_unescape(string)
+   return string.gsub(/\\u00([0-7][0-9A-Fa-f])/) { $1.hex.chr }
+ end
+
+
+ # Extract the list of candidate video URLs from the page's
+ # url_encoded_fmt_stream_map variable, which holds a list of URLs in
+ # the form url=foo1,url=foo2...
+ #
+ # Returns an array of unescaped URL strings, in page order. Raises
+ # StandardError if the variable cannot be found in +page_data+.
+ def get_fmt_stream_list(page_data)
+   found = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/.match(page_data)
+
+   if found.nil?
+     raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
+   end
+
+   # Drop every "url=" label, then break the list apart on commas.
+   raw_urls = found[1].gsub('url=', '').split(',')
+
+   cleaned_urls = raw_urls.map do |raw_url|
+     decoded = CGI::unescape(self.unicode_unescape(raw_url))
+
+     # Strip off everything after the first space in the URL.
+     # I don't know why this works, but if we leave the space
+     # in (encoded, even), Youtube throws us 403 errors.
+     decoded.gsub(/ .+$/, '')
+   end
+
+   return cleaned_urls
+ end
+
+
+ # Get the video id from the URL in @url.
+ #
+ # Three URL forms are tried in order, and the first one that matches
+ # wins:
+ #
+ #   1. /watch?v=video_id (possibly followed by parameters such as
+ #      "&hl=en"),
+ #   2. /v/video_id,
+ #   3. anything ending in /video_id, optionally followed by a "#".
+ #
+ # All three accept the same id characters (letters, digits, "_" and
+ # "-"), matching what VALID_YOUTUBE_URL_REGEX allows. The third form
+ # used to accept only alphanumerics, so ids containing "-" or "_"
+ # could not be parsed from such URLs.
+ #
+ # Returns the video id as a string. Raises StandardError if none of
+ # the forms match; this is fatal, since without the id we don't know
+ # where the video file is located.
+ def parse_video_id()
+   video_id_regexes = [
+     /v=([0-9a-z_\-]+)/i,
+     /\/v\/([0-9a-z_\-]+)/i,
+     /\/([0-9a-z_\-]+)\#?$/i
+   ]
+
+   video_id_regexes.each do |video_id_regex|
+     matches = video_id_regex.match(@url)
+     return matches[1] unless matches.nil?
+   end
+
+   raise StandardError.new("Could not parse the video id.")
+ end
+
+
+
+ # Parse the fmt_url_map variable out of +page_data+. The map is
+ # stored entirely in one URL-encoded variable, in the form
+ # key|value,key|value,... where each key is a numeric format and each
+ # value is the video URL for that format.
+ #
+ # Entries without a "|" or with an empty URL are skipped rather than
+ # being recorded with a nil URL, so a map consisting only of junk is
+ # reported as an error. Each entry is split on its first "|" only, so
+ # a URL that itself contains a "|" is kept whole.
+ #
+ # Returns a hash of integer formats to URL strings. Raises
+ # StandardError if the variable is missing or yields no usable
+ # entries.
+ def get_format_url_map(page_data)
+   matches = /fmt_url_map=([^&\"]+)/.match(page_data)
+
+   if matches.nil?
+     raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
+   end
+
+   url_map = {}
+
+   CGI::unescape(matches[1]).split(',').each do |entry|
+     key, value = entry.split('|', 2)
+     next if value.nil? || value.empty?
+     url_map[key.to_i] = value
+   end
+
+   if url_map.empty?
+     raise StandardError.new("Could not find any valid format URLs.")
+   end
+
+   return url_map
+ end
+
+
+ # Pick the format to download from +available_formats+, a list of
+ # numeric formats.
+ #
+ # The known formats are checked in order of preference, so the first
+ # one found to be available is the one returned. If none of them is
+ # available, the first entry of +available_formats+ is returned as a
+ # last resort (nil when the list is empty).
+ def get_desired_format(available_formats)
+   formats_by_preference = [37, 22, 35, 18, 34, 17]
+
+   preferred = formats_by_preference.find do |candidate|
+     available_formats.include?(candidate)
+   end
+
+   return preferred unless preferred.nil?
+
+   # Some unknown format is better than nothing.
+   return available_formats[0]
+ end
+
+end