src/website.rb

   1 #
   2 # Copyright Michael Orlitzky
   3 #
   4 # http://michael.orlitzky.com/
   5 #
   6 # This program is free software: you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 # GNU General Public License for more details.
  15 #
  16 # http://www.fsf.org/licensing/licenses/gpl.html
  17 #
  18
  19 # Needed for the default implementation of get_page_data.
  20 require 'net/http'
  21
  22 # Necessary in a lot of subclasses; plus, we need it
  23 # to parse the server name out of our URL.
  24 require 'uri'
  25
  26 # Needed to download.. things.
  27 require 'net/http'
  28
  29 # This class keeps track of all its subclasses
  30 # We use this to loop through every "website" in an
  31 # attempt to determine to which site a URL belongs.
  32 class Website
  33
  34   protected;
  35
  36   @url = nil
  37
  38
  39   def self.inherited(subclass)
  40     if superclass.respond_to? :inherited
  41       superclass.inherited(subclass)
  42     end
  43
  44     # Every time we're subclassed, add the new
  45     # subclass to our list of subclasses.
  46     @subclasses ||= []
  47     @subclasses << subclass
  48   end
  49
  50
  51   def server
  52     # Get the HTTP server portion of our URI
  53     uri = URI.parse(@url)
  54     return uri.host
  55   end
  56
  57
  58
  59   def get_page_data(url)
  60     # A naive implementation that just grabs the
  61     # data from a page.
  62     uri = URI.parse(url)
  63
  64     response = Net::HTTP.start(uri.host, uri.port) do |http|
  65       http.get(uri.request_uri, self.headers)
  66     end
  67
  68     # Set the referer in case it is needed for some later request.
  69     self.headers['Referer'] = uri.request_uri
  70
  71     return response.body
  72   end
  73
  74
  75
  76   public;
  77
  78   # Additional headers used when requesting data from the website.
  79   # These aren't passed as a parameter because the (final)
  80   # downloaders need them as well.
  81   attr_accessor :headers
  82
  83   def initialize(url)
  84     @url = url
  85     self.headers = { 'User-Agent' => Configuration::USER_AGENT }
  86   end
  87
  88
  89   def self.create(url)
  90     # Factory method returning an instance of
  91     # the appropriate subclass.
  92
  93     # While we're looping through the list of subclasses,
  94     # we'll set this to the Generic class.
  95     generic = nil
  96
  97     # Check the URL against each website's class.
  98     # The class will know whether or not the URL
  99     # "belongs" to its website.
 100     @subclasses.each do |w|
 101       if w.owns_url?(url)
 102         if w.to_s == 'Generic'
 103           generic = w
 104         else
 105           # We don't want to return Generic here because some
 106           # other subclasses further down the list might match
 107           # the URL.
 108           return w.new(url)
 109         end
 110       end
 111     end
 112
 113     # If nothing matched, try the generic parser.
 114     return generic.new(url)
 115   end
 116
 117
 118   # Abstract definition. Each subclass of Website
 119   # should support it on its own.
 120   def self.owns_url?(url)
 121     raise NotImplementedError
 122   end
 123
 124
 125   # Same here. Abstract.
 126   def get_video_url()
 127     raise NotImplementedError
 128   end
 129
 130
 131   # The website class should be responsible for determining the
 132   # video's filename. By default, we can take the last component
 133   # of the video URL, but in some cases, subclasses will want
 134   # to override this behavior.
 135   def get_video_filename()
 136     # Use whatever comes after the final front slash.
 137     file_and_params = get_video_url().split('/').pop()
 138
 139     # Unless it contains URL parameters. We don't want those.
 140     return file_and_params unless file_and_params.include?('?')
 141
 142     # There must be some parameters. Strip them off.
 143     param_start_idx = file_and_params.index('?')
 144     return file_and_params[0...(param_start_idx)]
 145   end
 146
 147 end