]> gitweb.michael.orlitzky.com - dead/whatever-dl.git/blob - src/websites/youtube.rb
Add Youtube support for the "fmt_list" and "t" variables stored as URL parameters.
[dead/whatever-dl.git] / src / websites / youtube.rb
#
# Copyright Michael Orlitzky
#
# http://michael.orlitzky.com/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# http://www.fsf.org/licensing/licenses/gpl.html
#
18
# Project-local file; expected to define Website, the superclass of
# the Youtube class below.
require 'src/website'

# Provides CGI::unescape, used to decode values scraped from the
# video page.
require 'cgi'
21
# Website implementation for Youtube. Given a Youtube URL, it works
# out the video id, fetches the corresponding /watch page, and builds
# a get_video URL (including the "t" parameter and, when one can be
# determined, the best available "fmt") along with a filename to save
# the video under.
class Youtube < Website

  # Accepts the /watch?v=video_id and /v/video_id forms, plus a
  # fragment-style form (letters, '#', then three short path
  # segments), each optionally preceded by "http://" and a subdomain,
  # and optionally followed by "&..." parameters and a trailing '#'.
  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i

  # Formats in order of preference (quality), best first.
  FORMAT_PREFERENCE = [37, 22, 35, 18, 34, 17].freeze

  # The format -> extension mapping is available on Wikipedia:
  #
  #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
  #
  # Any format not listed in one of these gets DEFAULT_EXTENSION.
  MP4_FORMATS = [18, 22, 35, 37].freeze
  THREE_GP_FORMATS = [17].freeze
  DEFAULT_EXTENSION = '.flv'

  # Returns the index of the match (truthy) if the url looks like a
  # Youtube video URL, and nil otherwise.
  def self.owns_url?(url)
    return url =~ VALID_YOUTUBE_URL_REGEX
  end


  def initialize(url)
    super

    # The @format variable just caches the format of the video we're
    # downloading. Storing it will prevent us from having to calculate
    # it twice. Zero means "not determined (yet)"; it is only set to
    # something else by get_video_url().
    @format = 0
  end


  # Build the URL from which the video file itself can be downloaded.
  #
  # As a side effect, the chosen format (if any) is cached in @format
  # so that get_video_filename() can pick a matching extension.
  #
  # Raises a StandardError if the video id, the "t" parameter, or the
  # format list cannot be parsed.
  def get_video_url()
    video_id = self.parse_video_id()

    # The video's URL (the "page data" URL) may be different from the
    # URL that was passed to the program. We support the /v/video_id
    # URL format, but that is *not* the main video page where we can
    # retrieve the "t" parameter. We can only get that from the
    # /watch?v=video_id form.
    #
    # NOTE(review): get_page_data is not defined in this class;
    # presumably it is inherited from Website -- confirm.
    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
    page_data = self.get_page_data(page_data_url)

    t_parameter = self.parse_t_parameter(page_data)

    video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"

    # Figure out which formats are available, and if any are,
    # choose the best one.
    available_formats = get_available_formats(page_data)
    desired_format = get_desired_format(available_formats)

    # None of the formats we know about were offered; leave the "fmt"
    # parameter off entirely.
    return video_url if desired_format.nil?

    # Cache the format so that when we're asked for the video
    # filename later, we don't have to recompute the format.
    @format = desired_format

    # And then stick the format parameter on the end of the URL.
    return video_url + "&fmt=#{desired_format}"
  end


  # The filename is the video id followed by an extension chosen
  # according to the cached @format. If get_video_url() has not set
  # @format (it is still zero), the default extension is used.
  #
  # Raises a StandardError if the video id cannot be parsed.
  def get_video_filename()
    extension = DEFAULT_EXTENSION

    if MP4_FORMATS.include?(@format)
      extension = '.mp4'
    elsif THREE_GP_FORMATS.include?(@format)
      extension = '.3gp'
    end

    return (self.parse_video_id() + extension)
  end


  protected

  # Get the video id from the URL. Should be relatively easy,
  # unless Youtube supports some URL formats of which I'm unaware.
  #
  # Raises a StandardError if none of the known forms match.
  def parse_video_id()
    # The forms are fairly easy to parse if you handle them one at a
    # time, in this order. The only tricky situation is when
    # parameters like "&hl=en" are tacked on to the end.
    video_id_regexes = [
      # The "first form," /watch?v=video_id.
      /v=([0-9a-z_\-]+)/i,
      # The second form, /v/video_id.
      /\/v\/([0-9a-z_\-]+)/i,
      # The third form: a final, purely alphanumeric path component.
      /\/([[:alnum:]]+)$/i
    ]

    video_id_regexes.each do |regex|
      matches = regex.match(@url)
      # Each regex has exactly one capture group, so a successful
      # match always has the id in position 1.
      return matches[1] unless matches.nil?
    end

    # If we made it here, we couldn't figure out the video id. Yes,
    # this is fatal, since we don't know where the video file is
    # located.
    raise StandardError.new("Could not parse the video id.")
  end


  # Parse out the "t" parameter from the video's page. I'm not sure
  # what "t" stands for, but it's required for the final video URL to
  # work. It can be stored in either JSON or URL parameters; the JSON
  # form is tried first.
  #
  # Returns the CGI-unescaped value, and raises a StandardError if
  # neither form is present in page_data.
  def parse_t_parameter(page_data)
    t_parameter_regexes = [ /\"t\"\:[[:space:]]\"([^\"]+?)\"/,
                            /&t=([^&\"\\]+)/ ]

    t_parameter = first_unescaped_capture(t_parameter_regexes, page_data)

    if t_parameter.nil?
      raise StandardError.new("Could not parse the 't' parameter.")
    end

    return t_parameter
  end


  # Parse the list of available formats from the "fmt_list"
  # variable. It can be stored as either a Flash variable (JSON
  # notation), or as URL parameter; the JSON form is tried first.
  #
  # Returns an array of integer format identifiers, and raises a
  # StandardError if no "fmt_list" can be found in page_data.
  def get_available_formats(page_data)
    fmt_list_regexes = [ /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/,
                         /fmt_list=([^&\"\\]+)/ ]

    fmts_string = first_unescaped_capture(fmt_list_regexes, page_data)

    if fmts_string.nil?
      raise StandardError.new("Could not find any valid formats.")
    end

    return fmts_string.split(',').map do |fmt|
      # Each "fmt" will look something like,
      #
      #   35/640000/9/0/115
      #
      # with the format identifier coming before the first slash.
      # Splitting (rather than slicing up to the index of the slash)
      # also copes with an entry that contains no slash at all.
      fmt.split('/', 2).first.to_i
    end
  end


  # Check for the presence of formats, in order of preference
  # (quality). That is, we check for the best formats first. As soon
  # as a format is found to be available, we return it as the
  # desired format, since the first format we find is going to be
  # the best available format.
  #
  # Returns nil if none of the preferred formats are available.
  def get_desired_format(available_formats)
    return FORMAT_PREFERENCE.find { |fmt| available_formats.include?(fmt) }
  end


  # Try each of the regexes against page_data, in order, and return
  # the CGI-unescaped first capture group of the first one that
  # matches. Returns nil if none of them match.
  def first_unescaped_capture(regexes, page_data)
    regexes.each do |regex|
      matches = regex.match(page_data)
      return CGI::unescape(matches[1]) unless matches.nil?
    end

    return nil
  end

end