Remove a number of fixtures whose tests no longer pass.

[dead/whatever-dl.git] / src / websites / youtube.rb
diff --git a/src/websites/youtube.rb b/src/websites/youtube.rb

index c2a37d603e59c1e210c52c0a33627b81df37c711..bdf2c2c4ba7bd28d25ecd1a236e8534142f8d2aa 100644 (file)
--- a/src/websites/youtube.rb
+++ b/src/websites/youtube.rb
@@ -49,22 +49,40 @@ class Youtube < Website
      page_data_url = "http://www.youtube.com/watch?v=#{video_id}"
      page_data = self.get_page_data(page_data_url)
  
-    # Get the URL map from the page.
-    fmt_url_map = get_format_url_map(page_data)
-
-    # Figure out which formats are available, and if any are,
-    # choose the best one.
-    available_formats = fmt_url_map.keys()
-    desired_format = get_desired_format(available_formats)
-
-    # First we cache the format so that when we're asked for the
-    # video filename later, we don't have to recompute the format.
-    @format = desired_format
-
-    # And then use whatever URL is available for the desired format.
-    # We assume that all available formats will have an entry in the
-    # fmt_url_map hash.
-    video_url = fmt_url_map[desired_format]
+    begin
+      # Get the URL map from the page.
+      fmt_url_map = get_format_url_map(page_data)
+
+      # Figure out which formats are available, and if any are,
+      # choose the best one.
+      available_formats = fmt_url_map.keys()
+      desired_format = get_desired_format(available_formats)
+
+      # First we cache the format so that when we're asked for the
+      # video filename later, we don't have to recompute the format.
+      @format = desired_format
+
+      # And then use whatever URL is available for the desired format.
+      # We assume that all available formats will have an entry in the
+      # fmt_url_map hash.
+      video_url = fmt_url_map[desired_format]
+      return video_url
+    rescue StandardError => e
+      # If the first approach fails, the page format has probably
+      # changed. This alternate method parses
+      # url_encoded_fmt_stream_map.
+      fmt_streams = get_fmt_stream_list(page_data)
+      video_url = self.choose_best_fmt_stream_url(fmt_streams)
+
+      # A duplicated "itag" parameter results in a 403.
+      itag_regex = /&itag=\d+/
+      matches = video_url.scan(itag_regex)
+
+      if matches.length > 1
+        # Get rid of the first occurrence.
+        video_url.sub!(itag_regex, '')
+      end
+    end
  
      return video_url
    end
@@ -90,6 +108,62 @@ class Youtube < Website
  
    protected;
  
+  def choose_best_fmt_stream_url(fmt_stream_urls)
+    # Take a list, generated by get_fmt_stream_list(), and choose the
+    # best URL out of the bunch based on the video format.
+    fmt_stream_urls.each do |fs|
+      if fs =~ /video\/mp4/ and fs =~ /quality=large/
+        return fs
+      elsif fs =~ /quality=large/
+        return fs
+      elsif fs =~ /video\/mp4/
+        return fs
+      else
+        return fs
+      end
+    end
+  end
+
+
+  def unicode_unescape(string)
+    # Unescape sequences like '\u0026'.
+    # Ok, only '\u0026' for now.
+    return string.gsub('\u0026', '&')
+  end
+
+
+  def get_fmt_stream_list(page_data)
+    # This is another (new?) method of embedding the video URLs.
+    # The url_encoded_fmt_stream_map variable contains a list of URLs
+    # in the form url=foo1,url=foo2...
+    #
+    # It looks like the first one in the list is the highest
+    # quality?  Let's just take that one for now.
+    fmt_stream_regex = /\"url_encoded_fmt_stream_map\": \"(.+?)\"/
+
+    matches = fmt_stream_regex.match(page_data)
+
+    if (matches.nil? || matches.length < 2)
+      raise StandardError.new("Could not parse the url_encoded_fmt_stream_map Flash variable.")
+    end
+
+    urlstring = matches[1]
+    urlstring.gsub!('url=', '')
+    urls = urlstring.split(',')
+
+    urls.each_index do |idx|
+      urls[idx] = self.unicode_unescape(urls[idx])
+      urls[idx] = CGI::unescape(urls[idx])
+      # Strip off everything after the first space in the URL.
+      # I don't know why this works, but if we leave the space
+      # in (encoded, even), Youtube throws us 403 errors.
+      urls[idx].gsub!(/ .+$/, '')
+    end
+
+    return urls
+  end
+
+
    # Get the video id from the URL. Should be relatively easy,
    # unless Youtube supports some URL formats of which I'm unaware.
    def parse_video_id()