X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2Fwebsite.rb;h=4a3f2afe36a0c009a095202e9cb2eb7e405a2395;hb=e91a9668c7be92d33a29d3645590195aaa1a3daa;hp=b5a501f9db8184d403d93ee1c13a34900b10506d;hpb=1d43361a1d8c6fc3938a2438baa8d8348129b4fd;p=dead%2Fwhatever-dl.git diff --git a/src/website.rb b/src/website.rb index b5a501f..4a3f2af 100644 --- a/src/website.rb +++ b/src/website.rb @@ -16,20 +16,26 @@ # http://www.fsf.org/licensing/licenses/gpl.html # +# Needed for the default implementation of get_page_data. +require 'net/http' + # Necessary in a lot of subclasses; plus, we need it # to parse the server name out of our URL. require 'uri' +# Needed to download.. things. +require 'net/http' + # This class keeps track of all its subclasses # We use this to loop through every "website" in an # attempt to determine to which site a URL belongs. class Website - + protected; - + @url = nil - + def self.inherited(subclass) if superclass.respond_to? :inherited superclass.inherited(subclass) @@ -48,18 +54,42 @@ class Website return uri.host end - + + + def get_page_data(url) + # A naive implementation that just grabs the + # data from a page. + uri = URI.parse(url) + + response = Net::HTTP.start(uri.host, uri.port) do |http| + http.get(uri.request_uri, self.headers) + end + + # Set the referer in case it is needed for some later request. + self.headers['Referer'] = uri.request_uri + + return response.body + end + + + public; + # Additional headers used when requesting data from the website. + # These aren't passed as a parameter because the (final) + # downloaders need them as well. + attr_accessor :headers + def initialize(url) @url = url + self.headers = { 'User-Agent' => Configuration::USER_AGENT } end - + def self.create(url) # Factory method returning an instance of # the appropriate subclass. - + # Check the URL against each website's class. # The class will know whether or not the URL # "belongs" to its website. @@ -81,20 +111,27 @@ class Website raise NotImplementedError end - + # Same here. Abstract. def get_video_url() raise NotImplementedError end - + # The website class should be responsible for determining the # video's filename. By default, we can take the last component # of the video URL, but in some cases, subclasses will want # to override this behavior. def get_video_filename() - # Use whatever comes after the final front slash. - return get_video_url().split('/').pop() + # Use whatever comes after the final front slash. + file_and_params = get_video_url().split('/').pop() + + # Unless it contains URL parameters. We don't want those. + return file_and_params unless file_and_params.include?('?') + + # There must be some parameters. Strip them off. + param_start_idx = file_and_params.index('?') + return file_and_params[0...(param_start_idx)] end - + end