# We use this to loop through every "website" in an
# attempt to determine to which site a URL belongs.
class Website
-
+
protected;
-
+
# NOTE(review): at class-body level this assigns an instance variable on the
# Website class object itself, not on Website instances; the per-instance
# @url is assigned in initialize. Confirm whether this line is still needed.
@url = nil
-
+
def self.inherited(subclass)
if superclass.respond_to? :inherited
superclass.inherited(subclass)
end
-
+
# Fetches +url+ with a single HTTP GET, sending the instance's current
# request headers, and returns the response body as a String.
#
# As a side effect the 'Referer' header is updated to the URL that was just
# fetched, so that a later request made with the same headers carries it.
#
# NOTE(review): the connection is opened without TLS and the response status
# is not inspected, so https:// URLs and redirects are not handled here.
def get_page_data(url)
  # A naive implementation that just grabs the
  # data from a page.
  uri = URI.parse(url)
  response = Net::HTTP.start(uri.host, uri.port) do |http|
    http.get(uri.request_uri, headers)
  end

  # Set the referer in case it is needed for some later request. Store the
  # absolute URL rather than only the path and query: servers that check
  # the referer generally compare it against a full URL.
  headers['Referer'] = uri.to_s

  response.body
end
-
-
+
+
public;
# Additional headers used when requesting data from the website.
# These aren't passed as a parameter because the (final)
# downloaders need them as well.
attr_accessor :headers
+
# Remembers the URL this instance was created for and seeds the request
# headers with the configured User-Agent.
#
# url - the page URL handed to the website class.
def initialize(url)
  @url = url
  # Go through the accessor (not @headers directly) so the writer stays the
  # single place where the headers hash is replaced.
  self.headers = { 'User-Agent' => Configuration::USER_AGENT }
end
-
+
# Factory method returning an instance of the appropriate subclass for +url+.
#
# Every registered class is asked, in order, whether it owns the URL. The
# first non-Generic class that does is instantiated and returned at once.
# The class named 'Generic' is only remembered while looping and is used as
# a fallback when no other class claims the URL.
#
# Returns the new instance, or nil when no class (Generic included) owns the
# URL or when no classes have been registered.
#
# NOTE(review): @subclasses is presumably filled by the inherited hook;
# confirm against the rest of the file.
def self.create(url)
  # While we're looping through the list of subclasses,
  # we'll set this to the Generic class.
  generic = nil

  # Check the URL against each website's class.
  # The class will know whether or not the URL
  # "belongs" to its website.
  (@subclasses || []).each do |w|
    next unless w.owns_url?(url)

    # We don't want to return Generic here because some
    # other subclasses further down the list might match
    # the URL.
    return w.new(url) unless w.to_s == 'Generic'

    generic = w
  end

  # If nothing more specific matched, try the generic parser. When Generic
  # did not match either there is nothing to instantiate, so return nil
  # instead of calling +new+ on nil.
  generic ? generic.new(url) : nil
end
raise NotImplementedError
end
-
+
# Abstract. This base implementation always raises NotImplementedError;
# get_video_filename splits the value returned here on '/', so an
# overriding implementation is expected to return the video URL as a String.
def get_video_url
  raise(NotImplementedError)
end
-
+
# The website class should be responsible for determining the
# video's filename. By default, we can take the last component
# of the video URL, but in some cases, subclasses will want
# to override this behavior.
# Returns the last path component of get_video_url with any URL parameters
# removed; an empty String is returned when there is no path component.
def get_video_filename
  # Drop the URL parameters first. Doing this before looking for slashes
  # matters: a parameter value may itself contain '/', and must not be
  # mistaken for part of the path.
  without_params = get_video_url.split('?', 2).first.to_s

  # Use whatever comes after the final front slash.
  without_params.split('/').last.to_s
end
-
+
end