Move all of the 'src' code under the more-standard 'lib'.

[dead/whatever-dl.git] / lib / whatever-dl / website.rb
diff --git a/lib/whatever-dl/website.rb b/lib/whatever-dl/website.rb

new file mode 100644 (file)

index 0000000..e9e65ca
--- /dev/null
+++ b/lib/whatever-dl/website.rb
@@ -0,0 +1,147 @@
+#
+# Copyright Michael Orlitzky
+#
+# http://michael.orlitzky.com/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# http://www.fsf.org/licensing/licenses/gpl.html
+#
+
+# Needed for the default implementation of get_page_data.
+require 'net/http'
+
+# Necessary in a lot of subclasses; plus, we need it
+# to parse the server name out of our URL.
+require 'uri'
+
+# Needed to download.. things.
+require 'net/http'
+
+# This class keeps track of all its subclasses.
+# We use this to loop through every "website" in an
+# attempt to determine to which site a URL belongs.
+class Website
+
+  protected;
+
+  # NOTE(review): this assignment runs in the class body, so it sets
+  # an instance variable on the Website class object itself, not on
+  # instances; each instance gets its own @url in initialize.
+  @url = nil
+
+  # Called by Ruby on a class each time it is subclassed. The new
+  # subclass is first handed up to our own superclass (when that
+  # responds to 'inherited'), so a grandchild of Website still ends
+  # up in Website's list, and is then added to the receiver's list.
+  def self.inherited(subclass)
+    if superclass.respond_to? :inherited
+      superclass.inherited(subclass)
+    end
+
+    # Every time we're subclassed, add the new
+    # subclass to our list of subclasses.
+    @subclasses ||= []
+    @subclasses << subclass
+  end
+
+  # Returns the host portion of our URL.
+  def server
+    return URI.parse(@url).host
+  end
+
+  # A naive implementation that just grabs the data from a page.
+  #
+  # Sends a single GET request for 'url' (a string), passing
+  # self.headers along, and returns the body of the response.
+  def get_page_data(url)
+    uri = URI.parse(url)
+
+    # Net::HTTP talks plain HTTP unless told otherwise, so TLS has
+    # to be requested explicitly for https:// addresses.
+    options = { :use_ssl => uri.is_a?(URI::HTTPS) }
+
+    response = Net::HTTP.start(uri.host, uri.port, options) do |http|
+      http.get(uri.request_uri, self.headers)
+    end
+
+    # Set the referer in case it is needed for some later request.
+    self.headers['Referer'] = uri.request_uri
+
+    return response.body
+  end
+
+  public;
+
+  # Additional headers used when requesting data from the website.
+  # These aren't passed as a parameter because the (final)
+  # downloaders need them as well.
+  attr_accessor :headers
+
+  # Remembers the URL and starts the headers off with our User-Agent.
+  def initialize(url)
+    @url = url
+    self.headers = { 'User-Agent' => Configuration::USER_AGENT }
+  end
+
+  # Factory method returning an instance of the appropriate subclass.
+  #
+  # Each registered subclass is asked, in registration order, whether
+  # it owns 'url'. The first owner not named 'Generic' wins; Generic
+  # is used only when it owns the URL and no other subclass does.
+  def self.create(url)
+    # While we're looping through the list of subclasses,
+    # we'll set this to the Generic class.
+    generic = nil
+
+    @subclasses.each do |w|
+      next unless w.owns_url?(url)
+
+      # We don't want to return Generic here because some
+      # other subclasses further down the list might match
+      # the URL.
+      return w.new(url) unless w.to_s == 'Generic'
+
+      generic = w
+    end
+
+    # If nothing matched, try the generic parser.
+    return generic.new(url)
+  end
+
+  # Abstract definition. Each subclass of Website
+  # should support it on its own.
+  def self.owns_url?(url)
+    raise NotImplementedError
+  end
+
+  # Same here. Abstract.
+  def get_video_url()
+    raise NotImplementedError
+  end
+
+  # The website class should be responsible for determining the
+  # video's filename. By default, we can take the last component
+  # of the video URL, but in some cases, subclasses will want
+  # to override this behavior.
+  def get_video_filename()
+    video_url = get_video_url()
+
+    # Drop any URL parameters first; they may contain slashes of their
+    # own, which must not be mistaken for path separators.
+    param_start_idx = video_url.index('?')
+    video_url = video_url[0...param_start_idx] if param_start_idx
+
+    # Use whatever comes after the final front slash.
+    return video_url.split('/').pop()
+  end
+
+end