#
# Copyright Michael Orlitzky
#
# http://michael.orlitzky.com/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# http://www.fsf.org/licensing/licenses/gpl.html
#

# Needed to download things, including the default implementation
# of get_page_data.
require 'net/http'

# Necessary in a lot of subclasses; plus, we need it
# to parse the server name out of our URL.
require 'uri'

# Base class for every supported "website".
#
# The class keeps a list of its subclasses so that Website.create can
# walk through them and ask each one whether a given URL belongs to it.
# Subclasses are expected to implement owns_url? and get_video_url.
class Website

  # Additional headers used when requesting data from the website.
  # These aren't passed as a parameter because the (final)
  # downloaders need them as well.
  attr_accessor :headers


  # Hook invoked by Ruby whenever a class inherits from this one (or
  # from one of its descendants).
  #
  # subclass - the newly defined class.
  def self.inherited(subclass)
    # Pass the newcomer up the chain first. For a direct child the
    # parent is Object, whose inherited hook is private, so nothing
    # happens. For a grandchild the parent is a Website class, so the
    # grandchild ends up in the ancestor's list as well.
    superclass.inherited(subclass) if superclass.respond_to?(:inherited)

    # Every time we're subclassed, add the new
    # subclass to our list of subclasses.
    @subclasses ||= []
    @subclasses << subclass
  end


  # Factory method returning an instance of the appropriate subclass.
  #
  # url - the page URL, handed to each subclass's owns_url? and then to
  #       the constructor of the winner.
  #
  # Returns a new instance of the first registered non-Generic subclass
  # that owns the URL; failing that, an instance of the Generic subclass
  # if it owns the URL; and nil when no registered subclass claims the
  # URL (or none has been registered at all).
  def self.create(url)
    # While we're looping through the list of subclasses,
    # we'll set this to the Generic class.
    generic = nil

    # The list does not exist until the first subclass is defined.
    (@subclasses || []).each do |candidate|
      next unless candidate.owns_url?(url)

      # We don't want to return Generic straight away because some
      # other subclass further down the list might match the URL.
      return candidate.new(url) unless candidate.to_s == 'Generic'

      generic = candidate
    end

    # If nothing specific matched, try the generic parser.
    generic ? generic.new(url) : nil
  end


  # Abstract definition. Each subclass of Website
  # should support it on its own.
  def self.owns_url?(url)
    raise NotImplementedError
  end


  # url - the URL of the page this object represents.
  def initialize(url)
    @url = url
    self.headers = { 'User-Agent' => Configuration::USER_AGENT }
  end


  # Abstract as well: subclasses return the URL of the video itself.
  def get_video_url
    raise NotImplementedError
  end


  # The website class should be responsible for determining the
  # video's filename. By default, we can take the last component
  # of the video URL, but in some cases, subclasses will want
  # to override this behavior.
  #
  # Returns the text after the final forward slash of get_video_url
  # with any "?..." parameters removed, or nil when there is no video
  # URL or it has no component to take a name from.
  def get_video_filename
    video_url = get_video_url
    return nil if video_url.nil?

    # Use whatever comes after the final forward slash.
    file_and_params = video_url.split('/').pop
    return nil if file_and_params.nil?

    # Drop the URL parameters, if any. We don't want those.
    file_and_params.split('?', 2).first
  end


  protected

  # Get the HTTP server portion of our URL.
  def server
    URI.parse(@url).host
  end


  # A naive implementation that just grabs the data from a page.
  #
  # url - the address of the page to fetch.
  #
  # Sends the current headers with the request, then stores the
  # request's path and query (not the absolute URL) as the 'Referer'
  # header in case it is needed for some later request.
  #
  # Returns the response body.
  def get_page_data(url)
    uri = URI.parse(url)

    # Net::HTTP talks plain HTTP unless told otherwise, so TLS has to
    # be requested explicitly for https:// pages.
    response = Net::HTTP.start(uri.host, uri.port,
                               :use_ssl => (uri.scheme == 'https')) do |http|
      http.get(uri.request_uri, headers)
    end

    headers['Referer'] = uri.request_uri

    response.body
  end

end