]> gitweb.michael.orlitzky.com - dead/whatever-dl.git/blob - src/website.rb
Add a generic parser that will hopefully supplant some site-specific subclasses.
[dead/whatever-dl.git] / src / website.rb
1 #
2 # Copyright Michael Orlitzky
3 #
4 # http://michael.orlitzky.com/
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # http://www.fsf.org/licensing/licenses/gpl.html
17 #
18
19 # Needed for the default implementation of get_page_data.
20 require 'net/http'
21
22 # Necessary in a lot of subclasses; plus, we need it
23 # to parse the server name out of our URL.
24 require 'uri'
25
26 # Needed to download things (already required above; the repeat is harmless).
27 require 'net/http'
28
# Base class for every supported video website.
#
# This class keeps track of all its subclasses. We use that list to
# loop through every "website" in an attempt to determine to which
# site a URL belongs (see Website.create).
class Website

  # Registry of known subclasses, filled in by Website.inherited.
  # Starting with an empty list lets Website.create run even before
  # anything has subclassed us.
  @subclasses = []


  # Hook invoked whenever this class, or one of its descendants, is
  # subclassed. The new subclass is recorded on the receiver and, via
  # the explicit call below, on every ancestor that defines this hook.
  #
  # subclass - the Class object that was just created.
  def self.inherited(subclass)
    # Object does not respond publicly to :inherited, so the chain
    # stops once it reaches Website itself.
    if superclass.respond_to? :inherited
      superclass.inherited(subclass)
    end

    # Descendants do not share our class-level @subclasses, so each
    # receiver lazily creates its own list.
    @subclasses ||= []
    @subclasses << subclass
  end


  # Factory method returning an instance of the appropriate subclass.
  #
  # url - the URL (String) to hand to the matching subclass.
  #
  # Returns an instance of the first non-Generic subclass whose
  # owns_url? accepts the URL; failing that, an instance of the class
  # named 'Generic' if it accepts the URL; otherwise nil.
  def self.create(url)
    # While we're looping through the list of subclasses,
    # we'll set this to the Generic class.
    generic = nil

    # Check the URL against each website's class. The class will know
    # whether or not the URL "belongs" to its website. The list may
    # be unset when create is called on a class without subclasses.
    (@subclasses || []).each do |w|
      next unless w.owns_url?(url)

      # We don't want to return Generic right away because some other
      # subclass further down the list might match the URL.
      if w.to_s == 'Generic'
        generic = w
      else
        return w.new(url)
      end
    end

    # If nothing specific matched, fall back on the generic parser,
    # provided that one exists and accepted the URL.
    generic.nil? ? nil : generic.new(url)
  end


  # Abstract definition. Each subclass of Website should answer
  # whether or not the given URL belongs to its site.
  #
  # Raises NotImplementedError unless overridden.
  def self.owns_url?(url)
    raise NotImplementedError
  end


  # Additional headers used when requesting data from the website.
  # These aren't passed as a parameter because the (final)
  # downloaders need them as well.
  attr_accessor :headers


  # url - the URL (String) of the page this instance represents.
  def initialize(url)
    @url = url
    self.headers = { 'User-Agent' => Configuration::USER_AGENT }
  end


  # Abstract definition. Each subclass should return the URL of the
  # video itself.
  #
  # Raises NotImplementedError unless overridden.
  def get_video_url
    raise NotImplementedError
  end


  # The website class is responsible for determining the video's
  # filename. By default, we take the last path component of the
  # video URL; subclasses may override this behavior.
  #
  # Returns the filename as a String, without any URL parameters.
  def get_video_filename
    # Strip the parameters *before* looking for the final slash: the
    # parameters may contain slashes themselves (an embedded URL, for
    # example), and those must not be mistaken for path separators.
    path = get_video_url.split('?', 2).first.to_s

    # Use whatever comes after the final front slash.
    path.split('/').last.to_s
  end


  protected

  # Returns the HTTP server (host) portion of our URL.
  def server
    URI.parse(@url).host
  end


  # A naive implementation that just grabs the data from a page.
  #
  # url - the URL (String) of the page to fetch.
  #
  # Returns the body of the response.
  def get_page_data(url)
    uri = URI.parse(url)

    # Net::HTTP speaks plain HTTP unless told otherwise, so TLS has
    # to be requested explicitly for https URLs.
    use_ssl = (uri.scheme == 'https')

    response = Net::HTTP.start(uri.host, uri.port, :use_ssl => use_ssl) do |http|
      http.get(uri.request_uri, headers)
    end

    # Set the referer in case it is needed for some later request.
    # NOTE(review): this stores only the path and query, not the full
    # URL -- confirm that is what the later requests expect.
    headers['Referer'] = uri.request_uri

    response.body
  end

end