Raise errors if the Youtube class can't parse either the video id or the format list.

[dead/whatever-dl.git] / src / websites / youtube.rb
diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb

index b9d9aa6c5e7ec586bb69b4d851428d975a006156..5f87754f9de85e69a3124f22c192f7df57e20a93 100644 (file)
--- a/src/websites/youtube.rb
+++ b/src/websites/youtube.rb
@@ -17,21 +17,26 @@
  #
  
  require 'src/website'
-
-# Needed to download the page, which is in turn
-# needed because it contains the video URL.
-require 'net/http'
-require 'uri'
-
+require 'cgi'
  
  class Youtube < Website
  
-  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?(www\.)?youtube\.com\/((watch\?v=)|(v\/))[a-z0-9_\-]+(\&.*)?\#?$/i
+  VALID_YOUTUBE_URL_REGEX = /^(http:\/\/)?([a-z0-9]+\.)?youtube\.com\/((watch\?v=)|(v\/)|([a-z]+\#[a-z]\/[a-z]\/[0-9]\/))[a-z0-9_\-]+(\&.*)?\#?$/i
    
    def self.owns_url?(url)
      return url =~ VALID_YOUTUBE_URL_REGEX
    end
  
+
+  def initialize(url)
+    super
+    
+    # The @format variable just caches the format of the video we're
+    # downloading. Storing it will prevent us from having to calculate
+    # it twice.
+    @format = 0
+  end
+
    
    def get_video_url()
      video_id = self.parse_video_id()
@@ -48,13 +53,40 @@ class Youtube < Website
      t_parameter = self.parse_t_parameter(page_data)
      
      video_url = "http://www.youtube.com/get_video?video_id=#{video_id}&t=#{t_parameter}"
-      
+
+    # Figure out which formats are available, and if any are,
+    # choose the best one.
+    available_formats = get_available_formats(page_data)
+    desired_format = get_desired_format(available_formats)
+    
+    if not desired_format.nil?
+      # First we cache the format so that when we're asked for the
+      # video filename later, we don't have to recompute the format.
+      @format = desired_format
+
+      # And then stick the format parameter on the end of the URL.
+      video_url = video_url + "&fmt=#{desired_format}"
+    end
+    
      return video_url
    end
  
  
    def get_video_filename()
-    return (self.parse_video_id() + '.flv')
+    # The format -> extension mapping is available on Wikipedia:
+    #
+    #   http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
+    #
+    # The default extension is .flv.
+    extension = '.flv'
+    
+    if [18, 22, 35, 37].include?(@format)
+      extension = '.mp4'
+    elsif (@format == 17)
+      extension = '.3gp'
+    end
+    
+    return (self.parse_video_id() + extension)
    end
  
    
@@ -63,9 +95,6 @@ class Youtube < Website
    # Get the video id from the URL. Should be relatively easy,
    # unless Youtube supports some URL formats of which I'm unaware.
    def parse_video_id()
-    # Return nil if we get no matches below.
-    video_id = nil
-    
      # Both URLs are fairly easy to parse if you handle
      # them one at a time. The only tricky situation is when
      # parameters like "&hl=en" are tacked on to the end.
@@ -78,10 +107,19 @@ class Youtube < Website
      # First form didn't work? Try the second.
      second_form_video_id_regex = /\/v\/([0-9a-z_\-]+)/i
      second_form_matches = second_form_video_id_regex.match(@url)
-    video_id = second_form_matches[1] if not (second_form_matches.nil? ||
-                                              second_form_matches.length < 2)
-    
-    return video_id
+    return second_form_matches[1] if not (second_form_matches.nil? ||
+                                          second_form_matches.length < 2)
+
+    # ...and the third.
+    third_form_video_id_regex = /\/([[:alnum:]]+)$/i
+    third_form_matches = third_form_video_id_regex.match(@url)
+    return third_form_matches[1] if not (third_form_matches.nil? ||
+                                         third_form_matches.length < 2)
+
+    # If we made it here, we couldn't figure out the video id. Yes,
+    # this is fatal, since we don't know where the video file is
+    # located.
+    raise StandardError.new("Could not parse the video id.")
    end
  
  
@@ -97,16 +135,46 @@ class Youtube < Website
  
      return t_parameter
    end
-
    
-  def get_page_data(url)
-    uri = URI.parse(url)
  
-    response = Net::HTTP.start(uri.host, uri.port) do |http|
-      http.get(uri.request_uri)
+  def get_available_formats(page_data)
+    # Parse the list of available formats from the "fmt_list" Flash
+    # variable.
+    available_formats = []
+    fmt_list_regex = /\"fmt_list\"\:[[:space:]]\"([^\"]+?)\"/
+    matches = fmt_list_regex.match(page_data)
+
+    if matches.nil?
+      raise StandardError.new("Could not find any valid formats.")
      end
  
-    return response.body
+    fmts_string = CGI::unescape(matches[1])
+    fmts_string.split(',').each do |fmt|
+      # Each "fmt" will look something like,
+      #
+      #   35/640000/9/0/115
+      #
+      # with the format identifier coming before the first slash.
+      first_slash_idx = fmt.index('/')
+      available_formats << fmt[0...first_slash_idx].to_i
+    end
+    
+    return available_formats
+  end
+
+
+  def get_desired_format(available_formats)
+    # Check for the presence of formats, in order of preference
+    # (quality). That is, we check for the best formats first. As soon
+    # as a format is found to be available, we return it as the
+    # desired format, since the first format we find is going to be
+    # the best available format.
+    return 37 if available_formats.include?(37)
+    return 22 if available_formats.include?(22)
+    return 35 if available_formats.include?(35)
+    return 18 if available_formats.include?(18)
+    return 34 if available_formats.include?(34)
+    return 17 if available_formats.include?(17)
    end
    
  end