]> gitweb.michael.orlitzky.com - dead/whatever-dl.git/blob - src/websites/youtube.rb
Re-fix the Youtube itag parameter. It works in both duplicated and non-duplicated...
[dead/whatever-dl.git] / src / websites / youtube.rb
1 #
2 # Copyright Michael Orlitzky
3 #
4 # http://michael.orlitzky.com/
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # http://www.fsf.org/licensing/licenses/gpl.html
17 #
18
19 require 'src/website'
20 require 'cgi'
21
# Website implementation for youtube.com. From a watch/embed URL it
# works out the direct URL of the video file (get_video_url) and a
# local filename for it (get_video_filename).
class Youtube < Website

  # The URL shapes we claim: /watch?v=ID, /v/ID and the
  # "name#x/y/0/ID" form, each optionally preceded by a scheme and a
  # subdomain, and optionally followed by "&..." parameters and/or a
  # trailing "#".
  #
  # The pattern is anchored with \A and \z rather than ^ and $, so a
  # multi-line string that merely contains a Youtube URL on one of
  # its lines is rejected.
  VALID_YOUTUBE_URL_REGEX = /\A(https?:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?\z/i

  # The formats we know about, in order of preference (quality),
  # best first.
  PREFERRED_FORMATS = [37, 22, 35, 18, 34, 17]


  # Returns a true value (the match position, 0) if +url+ looks like
  # a Youtube video URL, and nil otherwise.
  def self.owns_url?(url)
    return url =~ VALID_YOUTUBE_URL_REGEX
  end


  def initialize(url)
    super

    # The @format variable just caches the format of the video we're
    # downloading, so that get_video_filename() doesn't have to work
    # it out a second time. Zero means "not known (yet)".
    @format = 0
  end


  # Returns the URL of the video file itself.
  #
  # Two strategies are tried, in order: the fmt_url_map variable, and
  # (if anything at all goes wrong with that) the
  # url_encoded_fmt_stream_map variable. A StandardError is raised if
  # neither of them produces a URL.
  def get_video_url()
    video_id = self.parse_video_id()

    # The video's URL (the "page data" URL) may be different from the
    # URL that was passed to the program. We support the /v/video_id
    # URL format, but that is *not* the main video page. The page
    # data is always fetched from the /watch?v=video_id form.
    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
    page_data = self.get_page_data(page_data_url)

    begin
      # Figure out which formats are available and choose the best
      # one.
      fmt_url_map = get_format_url_map(page_data)
      desired_format = get_desired_format(fmt_url_map.keys())

      # Cache the format so that when we're asked for the video
      # filename later, we don't have to recompute it.
      @format = desired_format

      # Every key of fmt_url_map has a URL, since malformed entries
      # are dropped while the map is being built.
      return fmt_url_map[desired_format]
    rescue StandardError
      # If at first you do not succeed, maybe someone decided to
      # change the page. This alternate method parses
      # url_encoded_fmt_stream_map.
      fmt_streams = get_fmt_stream_list(page_data)
      video_url = self.choose_best_fmt_stream_url(fmt_streams)

      # A duplicated "itag" parameter results in a 403, so if there
      # is more than one, get rid of the first occurrence.
      itag_regex = /&itag=\d+/
      if video_url.scan(itag_regex).length > 1
        video_url.sub!(itag_regex, '')
      end

      # The itag value is the format number. Cache it here as well;
      # otherwise get_video_filename() would hand out the default
      # extension no matter which stream was chosen.
      itag = video_url[/[?&]itag=(\d+)/, 1]
      @format = itag.to_i unless itag.nil?

      return video_url
    end
  end


  # Returns the local filename for the video: the video id plus an
  # extension derived from the format cached by get_video_url().
  def get_video_filename()
    # The format -> extension mapping is available on Wikipedia:
    #
    #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
    #
    # The default extension is .flv.
    extension = '.flv'

    if [18, 22, 35, 37].include?(@format)
      extension = '.mp4'
    elsif (@format == 17)
      extension = '.3gp'
    end

    return (self.parse_video_id() + extension)
  end


  protected

  # Take a list, generated by get_fmt_stream_list(), and choose the
  # best URL out of the bunch based on the video format.
  #
  # The whole list is searched for each preference in turn:
  #
  #   1. an mp4 stream of "large" quality,
  #   2. any stream of "large" quality,
  #   3. any mp4 stream,
  #   4. whatever comes first.
  #
  # Raises a StandardError if the list is nil or empty.
  def choose_best_fmt_stream_url(fmt_stream_urls)
    if (fmt_stream_urls.nil? || fmt_stream_urls.empty?)
      raise StandardError.new("Could not find any format stream URLs.")
    end

    mp4_regex = /video\/mp4/
    large_regex = /quality=large/

    best = fmt_stream_urls.find { |fs| fs =~ mp4_regex and fs =~ large_regex }
    best = fmt_stream_urls.find { |fs| fs =~ large_regex } if best.nil?
    best = fmt_stream_urls.find { |fs| fs =~ mp4_regex } if best.nil?
    best = fmt_stream_urls.first if best.nil?

    return best
  end


  # Unescape sequences like '\u0026' (a backslash followed by
  # "u0026"). Ok, only '\u0026' for now; it becomes an ampersand.
  def unicode_unescape(string)
    return string.gsub('\u0026', '&')
  end


  # This is another method of embedding the video URLs. The
  # url_encoded_fmt_stream_map variable contains a list of URLs in
  # the form url=foo1,url=foo2...
  #
  # Returns the list of decoded URLs, in page order. Raises a
  # StandardError if the variable can't be found in +page_data+.
  def get_fmt_stream_list(page_data)
    fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/
    matches = fmt_stream_regex.match(page_data)

    if matches.nil?
      raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
    end

    # The list has to be split on the commas *before* anything is
    # unescaped, since the escaped entries may contain commas.
    entries = matches[1].gsub('url=', '').split(',')

    urls = entries.map do |entry|
      decoded = CGI::unescape(self.unicode_unescape(entry))
      # Strip off everything after the first space in the URL.
      # I don't know why this works, but if we leave the space
      # in (encoded, even), Youtube throws us 403 errors.
      decoded.gsub(/ .+$/, '')
    end

    return urls
  end


  # Get the video id from the URL. Should be relatively easy,
  # unless Youtube supports some URL formats of which I'm unaware.
  #
  # Raises a StandardError if none of the known forms match. That is
  # fatal, since we then don't know where the video file is located.
  def parse_video_id()
    # The forms are fairly easy to parse if you handle them one at a
    # time. The only tricky situation is when parameters like
    # "&hl=en" are tacked on to the end.
    id_regexes = [
      # The "first form," /watch?v=video_id.
      /v=([0-9a-z_\-]+)/i,

      # The second, /v/video_id.
      /\/v\/([0-9a-z_\-]+)/i,

      # ...and the third, where the id is the last component of the
      # path. Ids may contain dashes and underscores, and may be
      # followed by parameters and/or a "#", just like
      # VALID_YOUTUBE_URL_REGEX allows.
      /\/([0-9a-z_\-]+)(&[^\/]*)?\#?$/i
    ]

    id_regexes.each do |id_regex|
      matches = id_regex.match(@url)
      return matches[1] unless matches.nil?
    end

    raise StandardError.new("Could not parse the video id.")
  end


  # The fmt_url_map variable (perhaps unsurprisingly) maps formats to
  # video URLs. This makes it easyish to parse the video URLs.
  #
  # Returns a hash of integer format => URL. Raises a StandardError
  # if the variable is missing from +page_data+, or if it contains no
  # usable entries.
  def get_format_url_map(page_data)
    url_map_regex = /fmt_url_map=([^&\"]+)/
    matches = url_map_regex.match(page_data)

    if matches.nil?
      raise StandardError.new("Could not parse the fmt_url_map Flash variable.")
    end

    # The map is stored entirely in one Flash variable. The format is
    # key|value,key|value,...
    url_map = {}
    maptext = CGI::unescape(matches[1])

    maptext.split(',').each do |entry|
      fields = entry.split('|')
      key = fields[0].to_i
      value = fields[1]

      # Skip anything that isn't a format number followed by a URL.
      # Storing such an entry would make the map look usable when it
      # isn't, and the caller would never try its alternate method.
      next if (key < 1 || value.nil? || value.empty?)

      url_map[key] = value
    end

    if (url_map.length < 1)
      raise StandardError.new("Could not find any valid format URLs.")
    end

    return url_map
  end


  # Check for the presence of formats, in order of preference
  # (quality). The first entry of PREFERRED_FORMATS that is available
  # is the best available format, so that's the one returned.
  #
  # If none of the known formats are available, the first available
  # format is returned as a last resort (nil for an empty list).
  def get_desired_format(available_formats)
    best = PREFERRED_FORMATS.find { |fmt| available_formats.include?(fmt) }
    return best unless best.nil?

    return available_formats[0]
  end

end