Add Youtube support for the "fmt_list" and "t" variables stored as URL parameters.

[dead/whatever-dl.git] / src / websites / youtube.rb
diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb

index 532d8cae4653560fec7341d91d2328ca47027e12..ed5ab4b8a96efa62ee49379caf205f9b89c5e69c 100644 (file)
--- a/src/websites/youtube.rb
+++ b/src/websites/youtube.rb
@@ -21,7 +21,7 @@ require 'cgi'
  
  class Youtube < Website
  
-  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i
+  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i
    
    def self.owns_url?(url)
      return url =~ VALID_YOUTUBE_URL_REGEX
@@ -95,9 +95,6 @@ class Youtube < Website
    # Get the video id from the URL. Should be relatively easy,
    # unless Youtube supports some URL formats of which I'm unaware.
    def parse_video_id()
-    # Return nil if we get no matches below.
-    video_id = nil
-    
      # Both URLs are fairly easy to parse if you handle
      # them one at a time. The only tricky situation is when
      # parameters like "&hl=en" are tacked on to the end.
@@ -110,49 +107,68 @@ class Youtube < Website
      # First form didn't work? Try the second.
      second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
      second_form_matches = second_form_video_id_regex.match(@url)
-    video_id = second_form_matches[1] if not (second_form_matches.nil? ||
-                                              second_form_matches.length < 2)
-    
-    return video_id
+    return second_form_matches[1] if not (second_form_matches.nil? ||
+                                          second_form_matches.length < 2)
+
+    # ...and the third.
+    third_form_video_id_regex = /\/([[:alnum:]]+)$/i
+    third_form_matches = third_form_video_id_regex.match(@url)
+    return third_form_matches[1] if not (third_form_matches.nil? ||
+                                         third_form_matches.length < 2)
+
+    # If we made it here, we couldn't figure out the video id. Yes,
+    # this is fatal, since we don't know where the video file is
+    # located.
+    raise StandardError.new("Could not parse the video id.")
    end
  
  
    # Parse out the "t" parameter from the video's page. I'm not sure
-  # what "t" stands for, but it's located in some JSON, and is required
-  # for the final video URL to work.
+  # what "t" stands for, but it's required for the final video URL to
+  # work. It can be stored in either JSON or URL parameters.
    def parse_t_parameter(page_data)
      t_parameter = nil
      
-    t_parameter_regex = /\"t\"\:[[:space:]]\"([^\"]+?)\"/
-    matches = t_parameter_regex.match(page_data)
-    t_parameter = matches[1] if not (matches.nil? || matches.length < 2)
+    t_parameter_regexes = [ /\"t\"\:[[:space:]]\"([^\"]+?)\"/,
+                            /&t=([^&\"\\]+)/ ]
+    matches = t_parameter_regexes.map { |tpr| tpr.match(page_data) }
+
+    if matches.nitems == 0
+      raise StandardError.new("Could not parse the 't' parameter.")
+    end
+
+    first_match = matches.compact[0]
+    t_parameter = CGI::unescape(first_match[1])
  
      return t_parameter
    end
    
  
    def get_available_formats(page_data)
-    # Parse the list of available formats from the "fmt_list" Flash
-    # variable.
+    # Parse the list of available formats from the "fmt_list"
+    # variable. It can be stored as either a Flash variable (JSON
+    # notation), or as a URL parameter.
      available_formats = []
-    fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/
-    matches = fmt_list_regex.match(page_data)
-
-    if matches.nil?
-      return nil
-    else
-      fmts_string = CGI::unescape(matches[1])
-
-      fmts_string.split(',').each do |fmt|
-        # Each "fmt" will look something like,
-        #
-        #   35/640000/9/0/115
-        #
-        # with the format identifier coming before the first slash.
-        first_slash_idx = fmt.index('/')
-        available_formats << fmt[0...first_slash_idx].to_i
-      end
-      
+    fmt_list_regexes = [ /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/,
+                         /fmt_list=([^&\"\\]+)/ ]
+
+    matches = fmt_list_regexes.map { |flr| flr.match(page_data) }
+
+    if matches.nitems == 0
+      raise StandardError.new("Could not find any valid formats.")
+    end
+
+    first_match = matches.compact[0]
+    fmts_string = CGI::unescape(first_match[1])
+
+    fmts_string.split(',').each do |fmt|
+      # Each "fmt" will look something like,
+      #
+      #   35/640000/9/0/115
+      #
+      # with the format identifier coming before the first slash.
+      first_slash_idx = fmt.index('/')
+      available_formats << fmt[0...first_slash_idx].to_i
      end
      
      return available_formats