2 # Copyright Michael Orlitzky
4 # http://michael.orlitzky.com/
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # http://www.fsf.org/licensing/licenses/gpl.html
22 class Youtube
< Website
# Pattern for the YouTube URL shapes this class accepts (case-insensitive).
#
# The scheme (http:// or https://) and a single subdomain label are both
# optional. After "youtube.com/" the path must be one of:
#
#   * watch?v=<id>
#   * v/<id>
#   * <letters>#<letter>/<letter>/<digit>/<id>
#
# where <id> is made of letters, digits, "_" and "-". An optional
# "&..." tail and an optional trailing "#" are allowed.
#
# https:// is accepted alongside http:// so that secure links are not
# rejected; every other alternative and capture group is unchanged.
VALID_YOUTUBE_URL_REGEX = /^(https?:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i
# Report whether this class knows how to handle the given URL.
#
# url - the URL to test.
#
# Returns the result of matching url against VALID_YOUTUBE_URL_REGEX:
# the match offset (truthy) when the URL is recognised, nil otherwise.
def self.owns_url?(url)
  url =~ VALID_YOUTUBE_URL_REGEX
end
# The @format variable just caches the format of the video we're
# downloading. Storing it will prevent us from having to calculate
# it more than once.
42 video_id
= self.parse_video_id()
44 # The video's URL (the "page data" URL) may be different from the
45 # URL that was passed to the program. We support the /v/video_id
46 # URL format, but that is *not* the main video page where we can
47 # retrieve the "t" parameter. We can only get that from the
48 # /watch?v=video_id form.
49 page_data_url
= "http://www.youtube.com/watch?v=#{video_id}"
50 page_data
= self.get_page_data(page_data_url
)
53 # Get the URL map from the page.
54 fmt_url_map
= get_format_url_map(page_data
)
56 # Figure out which formats are available, and if any are,
57 # choose the best one.
58 available_formats
= fmt_url_map
.keys()
59 desired_format
= get_desired_format(available_formats
)
61 # First we cache the format so that when we're asked for the
62 # video filename later, we don't have to recompute the format.
63 @format = desired_format
# And then use whatever URL is available for the desired format.
# We assume that all available formats will have an entry in the
# fmt_url_map.
68 video_url
= fmt_url_map
[desired_format
]
# If at first you do not succeed, maybe the page format has
# changed. This alternate method parses
# url_encoded_fmt_stream_map.
74 fmt_streams
= get_fmt_stream_list(page_data
)
75 video_url
= self.choose_best_fmt_stream_url(fmt_streams
)
77 # A duplicated "itag" parameter results in a 403.
78 itag_regex
= /&itag=\d+/
79 matches
= video_url
.scan(itag_regex
)
82 # Get rid of the first occurrence.
83 video_url
.sub!
(itag_regex
, '')
# Build the filename for the video: the id returned by parse_video_id()
# with a file extension appended.
#
# The extension is selected from the cached @format value: formats 18,
# 22, 35 and 37 take the first branch, format 17 takes the second, and
# any other value keeps the default.
#
# NOTE(review): the lines that assign `extension` (the default and the
# value for each branch) are not visible in this view -- confirm them
# against the full file before relying on this description.
91 def get_video_filename()
92 # The format -> extension mapping is available on Wikipedia:
94 # http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
96 # The default extension is .flv.
# @format is assigned when the video URL is computed; presumably this
# method is called afterwards -- TODO confirm.
99 if [18, 22, 35, 37].include?(@format)
101 elsif (@format == 17)
# The result is the video id and the extension concatenated.
105 return (self.parse_video_id() + extension
)
# Choose one URL from the list produced by get_fmt_stream_list().
#
# fmt_stream_urls - the list of candidate URLs.
#
# Each entry is tested, in list order, against three conditions in this
# priority: it mentions both video/mp4 and quality=large; it mentions
# quality=large; it mentions video/mp4.
#
# NOTE(review): the branch bodies and whatever happens when no entry
# matches are not visible in this view. Presumably each branch returns
# the current entry -- confirm. If so, an earlier entry that satisfies
# only a lower-priority condition would win over a later entry that
# satisfies the first one; verify that this is intended.
111 def choose_best_fmt_stream_url(fmt_stream_urls
)
112 # Take a list, generated by get_fmt_stream_list(), and choose the
113 # best URL out of the bunch based on the video format.
114 fmt_stream_urls
.each
do |fs
|
# Highest priority: both markers present in the same entry.
115 if fs
=~
/video\/mp4
/ and fs =~ /quality
=large
/
# Next: the quality marker alone.
117 elsif fs
=~
/quality=large/
# Last: the container marker alone.
119 elsif fs
=~
/video\/mp4
/
# Undo the textual escape sequences found in the page data.
#
# string - the text to unescape.
#
# Only the six-character sequence \u0026 is handled for now; every
# occurrence is replaced with "&". Returns a new string.
def unicode_unescape(string)
  escaped_ampersand = '\u0026'
  string.gsub(escaped_ampersand, '&')
end
# Extract the list of stream URLs embedded in the page through the
# url_encoded_fmt_stream_map variable, whose value has the form
# url=foo1,url=foo2...
#
# page_data - the text of the video page.
#
# Returns an array of URLs, in the order they appear in the page.
# Raises StandardError when the variable cannot be found.
def get_fmt_stream_list(page_data)
  found = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/.match(page_data)

  if found.nil? || found.length < 2
    raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
  end

  # Drop the "url=" markers, then treat each comma-separated piece as
  # one URL.
  found[1].gsub('url=', '').split(',').map do |raw_url|
    decoded = CGI::unescape(self.unicode_unescape(raw_url))

    # Strip off everything after the first space in the URL. If the
    # space is left in (even encoded), Youtube answers with 403 errors.
    decoded.gsub(/ .+$/, '')
  end
end
167 # Get the video id from the URL. Should be relatively easy,
168 # unless Youtube supports some URL formats of which I'm unaware.
170 # Both URLs are fairly easy to parse if you handle
171 # them one at a time. The only tricky situation is when
172 # parameters like "&hl=en" are tacked on to the end.
173 # We'll call /watch?v=video_id the "first form."
174 first_form_video_id_regex
= /v=([0-9a-z_\-]+)/i
175 first_form_matches
= first_form_video_id_regex
.match(@url)
176 if not first_form_matches
.nil? || first_form_matches
.length
< 2
177 return first_form_matches
[1]
180 # First form didn't work? Try the second.
181 second_form_video_id_regex
= /\/v\
/([0-9a-z_\-]+)/i
182 second_form_matches
= second_form_video_id_regex
.match(@url)
183 if not second_form_matches
.nil? || second_form_matches
.length
< 2
184 return second_form_matches
[1]
188 third_form_video_id_regex
= /\/([[:alnum:]]+
)$/i
189 third_form_matches
= third_form_video_id_regex
.match(@url)
190 if not third_form_matches
.nil? || third_form_matches
.length
< 2
191 return third_form_matches
[1]
# If we made it here, we couldn't figure out the video id. Yes,
# this is fatal, since we don't know where the video file is
# located.
197 raise StandardError
.new("Could not parse the video id.")
# Parse the fmt_url_map variable out of the page and break it into
# format/URL pairs.
#
# page_data - the text of the video page.
#
# Raises StandardError when fmt_url_map cannot be found in page_data,
# and again when url_map holds no entries after parsing.
#
# NOTE(review): the lines that create url_map, store each key/value
# pair in it, and return the result are not visible in this view.
# Presumably the method returns url_map with integer format codes
# mapped to URLs -- confirm against the full file.
202 def get_format_url_map(page_data
)
203 # Youtube has implemented a new fmt_url_map that (perhaps
204 # unsurprisingly) maps formats to video URLs. This makes it
205 # easyish to parse the video URLs.
# The capture group takes everything after "fmt_url_map=" up to the
# next "&" or double quote.
207 url_map_regex
= /fmt_url_map=([^&\"]+)/
209 matches
= url_map_regex
.match(page_data
)
# NOTE(review): a successful match always has length >= 2 here (the
# whole match plus one group), so only the nil? test can trigger.
211 if (matches
.nil? || matches
.length
< 1)
212 raise StandardError
.new("Could not parse the fmt_url_map Flash variable.")
215 # The map is stored entirely in one Flash variable. The format is
216 # key|value,key|value,...
217 maptext
= CGI
::unescape(matches
[1])
218 entries
= maptext
.split(',')
219 entries
.each
do |entry
|
# The part before the first "|" is converted to an integer key; the
# part after it is the value.
220 key
= entry
.split('|')[0].to_i
221 value
= entry
.split('|')[1]
225 if (url_map
.length
< 1)
226 raise StandardError
.new("Could not find any valid format URLs.")
# Pick the format to download from the formats a video offers.
#
# available_formats - the list of format codes on offer.
#
# The known formats are ranked best quality first. The first ranked
# code that appears in available_formats is returned. When none of the
# known codes is present, the first element of available_formats is
# returned as a last resort (nil for an empty array).
def get_desired_format(available_formats)
  ranked_formats = [37, 22, 35, 18, 34, 17]

  best_known = ranked_formats.find do |candidate|
    available_formats.include?(candidate)
  end
  return best_known unless best_known.nil?

  # Nothing we recognise: fall back to whatever is listed first.
  available_formats[0]
end