Added the ability to download Youtube videos.

[dead/whatever-dl.git] / src / websites / youtube.rb
diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb

new file mode 100644 (file)

index 0000000..6766af2
--- /dev/null
+++ b/src/websites/youtube.rb
@@ -0,0 +1,107 @@
+#
+# Copyright Michael Orlitzky
+#
+# http://michael.orlitzky.com/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# http://www.fsf.org/licensing/licenses/gpl.html
+#
+
+require 'src/website'
+
+# Needed to download the page, which is in turn
+# needed because it contains the video URL.
+require 'net/http'
+require 'uri'
+
+
+class Youtube < Website
+
+  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[[:alnum:]]+(\&.*)?$/
+  
+  def self.owns_url?(url)
+    return url =~ VALID_YOUTUBE_URL_REGEX
+  end
+
+  
+  def get_video_url(url)
+    video_id = self.parse_video_id(url)
+
+    # The video's URL (the "page data" URL) may be different from the
+    # URL that was passed to the program. We support the /v/video_id
+    # URL format, but that is *not* the main video page where we can
+    # retrieve the "t" parameter. We can only get that from the
+    # /watch?v=video_id form.
+    page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
+    page_data = self.get_page_data(page_data_url)
+
+    # Magic.
+    t_parameter = self.parse_t_parameter(page_data)
+    
+    video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
+      
+    return video_url
+  end
+
+  
+  protected;
+
+  # Get the video id from the URL. Should be relatively easy,
+  # unless Youtube supports some URL formats of which I'm unaware.
+  def parse_video_id(url)
+    # Return nil if we get no matches below.
+    video_id = nil
+    
+    # Both URLs are fairly easy to parse if you handle
+    # them one at a time. The only tricky situation is when
+    # parameters like "&hl=en" are tacked on to the end.
+    # We'll call /watch?v=video_id the "first form."
+    first_form_video_id_regex = /v=([[:alnum:]]+)$/
+    first_form_matches = first_form_video_id_regex.match(url)
+    return first_form_matches[1] if not (first_form_matches.nil? ||
+                                         first_form_matches.length < 2)
+
+    # First form didn't work? Try the second.
+    second_form_video_id_regex = /\/v\/([[:alnum:]]+)/
+    second_form_matches = second_form_video_id_regex.match(url)
+    video_id = second_form_matches[1] if not (second_form_matches.nil? ||
+                                              second_form_matches.length < 2)
+    
+    return video_id
+  end
+
+
+  # Parse out the "t" parameter from the video's page. I'm not sure
+  # what "t" stands for, but it's located in some JSON, and is required
+  # for the final video URL to work.
+  def parse_t_parameter(page_data)
+    t_parameter = nil
+    
+    t_parameter_regex = /\"t\"\:[[:space:]]\"([[:alnum:]]+)\"/
+    matches = t_parameter_regex.match(page_data)
+    t_parameter = matches[1] if not (matches.nil? || matches.length < 2)
+
+    return t_parameter
+  end
+
+  
+  def get_page_data(url)
+    uri = URI.parse(url)
+
+    response = Net::HTTP.start(uri.host, uri.port) do |http|
+      http.get(uri.request_uri)
+    end
+
+    return response.body
+  end
+  
+end