]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Combine ArticlePage and FullPage into one Page type.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 module LWN.Page
2 where
3
4 import Text.Pandoc
5 import Data.Time (getCurrentTime)
6 import System.IO (Handle)
7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
8 import Data.String.Utils (split, strip)
9 import Data.Maybe (catMaybes, fromJust, isNothing)
10 import Data.Tree.NTree.TypeDefs (NTree)
11 import Text.XML.HXT.Core (
12 ArrowXml,
13 IOSArrow,
14 XmlTree,
15 XNode,
16 (>>>),
17 (/>),
18 (//>),
19 getChildren,
20 getText,
21 hasAttrValue,
22 hasName,
23 none,
24 processTopDown,
25 runX,
26 xshow,
27 when
28 )
29 import Text.HandsomeSoup (css, parseHtml)
30
31 import LWN.Article
32 import XHTML
33
34 data Page =
35 -- | An LWN page with one article on it.
36 ArticlePage { article :: Article } |
37
38 -- | An LWN page with more than one article on it. These require
39 -- different parsing and display functions than the single-article
40 -- pages.
41 FullPage { headline :: String,
42 articles :: [Article] }
43
44
45 instance XHTML Page where
46 to_xhtml (ArticlePage a) =
47 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
48 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
49 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
50 "<head>" ++
51 " <meta http-equiv=\"Content-Type\"" ++
52 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
53 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
54 "</head>" ++
55 "<body>" ++
56 "<div>" ++
57 (to_xhtml a) ++
58 "</div>" ++
59 "</body>" ++
60 "</html>"
61
62 to_xhtml (FullPage hl as) =
63 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
64 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
65 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
66 "<head>" ++
67 " <meta http-equiv=\"Content-Type\"" ++
68 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
69 " <title>" ++ hl ++ "</title>" ++
70 "</head>" ++
71 "<body>" ++
72 "<div>" ++
73 "<h1>" ++ hl ++ "</h1>" ++
74 (concatMap to_xhtml as) ++
75 "</div>" ++
76 "</body>" ++
77 "</html>"
78
79
80
81 remove_images :: (ArrowXml a) => a XmlTree XmlTree
82 remove_images =
83 processTopDown ((none) `when` is_image)
84
85
86 -- | Preprocessing common to both page types.
87 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
88 preprocess xml =
89 xml >>>remove_images
90
91
92 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
93 parse xml = do
94 let clean_xml = preprocess xml
95 appr <- ap_parse clean_xml
96 fppr <- fp_parse clean_xml
97 return $
98 if (isNothing appr) then
99 fppr
100 else
101 appr
102
103 --
104 -- ArticlePage Stuff
105 --
106 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
107 ap_parse xml = do
108 arts <- ap_parse_articles xml
109 case arts of
110 Just [x] -> return $ Just $ ArticlePage x
111 _ -> return Nothing
112
113
114 ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
115 ap_parse_headline xml = do
116 let element_filter = xml >>> css "div.PageHeadline h1"
117 let element_text_filter = element_filter /> getText
118 element_text <- runX element_text_filter
119 return $ case element_text of
120 [x] -> Just $ strip x
121 [] -> Nothing
122 _ -> error "Found more than one headline."
123
124 ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
125 ap_parse_byline xml = do
126 let element_filter = xml >>> css "div.Byline"
127 let element_text_filter = element_filter /> getText
128 element_text <- runX element_text_filter
129 return $ case element_text of
130 [x] -> Just $ strip x
131 [] -> Nothing
132 _ -> error "Found more than one byline."
133
134
135 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
136 ap_parse_body xml = do
137 let element_filter = xml >>> css "div.ArticleText"
138 let element_html_filter = xshow element_filter
139 element_html <- runX element_html_filter
140 return $ case element_html of
141 [x] -> Just x
142 [] -> Nothing
143 _ -> error "Found more than one article."
144
145
146 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
147 ap_parse_articles xml = do
148 parsed_headline <- ap_parse_headline xml
149 parsed_byline <- ap_parse_byline xml
150 parsed_body <- ap_parse_body xml
151 let title' = Title (fromJust parsed_headline)
152 let byline' = Byline parsed_byline
153 let body' = BodyHtml (fromJust parsed_body)
154 return $ Just $ [Article title' byline' body']
155
156
157
158
159 --
160 -- FullPage Stuff
161 --
162
163 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
164 fp_parse xml = do
165 hl <- fp_parse_headline xml
166 parsed_articles <- fp_parse_articles xml
167 case parsed_articles of
168 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
169 _ -> return Nothing
170
171
172
173
174 fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
175 fp_parse_headline xml = do
176 let element_filter = xml >>> css "div.PageHeadline h1"
177 let element_text_filter = element_filter /> getText
178 element_text <- runX element_text_filter
179 return $ case element_text of
180 [x] -> Just $ strip x
181 [] -> Nothing
182 _ -> error "Found more than one headline."
183
184 fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
185 fp_parse_article_byline xml = do
186 let element_filter = xml >>> css "div.FeatureByLine"
187 let element_text_filter = element_filter /> getText
188 element_text <- runX element_text_filter
189 return $ case element_text of
190 [x] -> Just $ strip x
191 [] -> Nothing
192 _ -> error "Found more than one article byline."
193
194
195 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
196 fp_parse_article_title xml = do
197 let element_filter = xml >>> css "h2.SummaryHL"
198 let element_text_filter = element_filter //> getText
199 element_text <- runX element_text_filter
200 return $ case element_text of
201 [x] -> Just $ strip x
202 [] -> Nothing
203 _ -> error "Found more than one article title."
204
205
206
207 is_title :: (ArrowXml a) => a XmlTree XmlTree
208 is_title =
209 (hasName "h2")
210 >>>
211 (hasAttrValue "class" (== "SummaryHL"))
212
213
214 is_byline :: (ArrowXml a) => a XmlTree XmlTree
215 is_byline =
216 (hasName "div")
217 >>>
218 (hasAttrValue "class" (== "FeatureByLine"))
219
220
221 is_image :: (ArrowXml a) => a XmlTree XmlTree
222 is_image =
223 hasName "img"
224
225
226 remove_title :: (ArrowXml a) => a XmlTree XmlTree
227 remove_title =
228 processTopDown ((none) `when` is_title)
229
230
231 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
232 remove_byline =
233 processTopDown ((none) `when` is_byline)
234
235
236
237 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
238 fp_parse_article_body xml = do
239 -- First, delete the article title and byline.
240 let clean_xml' = xml >>> remove_title >>> remove_byline
241 -- The only child of the body element should be a div.lwn-article
242 -- since we wrapped the article's HTML in that.
243 let clean_xml = clean_xml' >>> css "body" >>> getChildren
244 clean_html <- runX . xshow $ clean_xml
245 return $ case clean_html of
246 [x] -> Just x
247 [] -> Nothing
248 _ -> error "Found more than one article body."
249
250 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
251 fp_parse_article xml = do
252 parsed_article_title <- fp_parse_article_title xml
253 parsed_article_byline <- fp_parse_article_byline xml
254 parsed_article_body <- fp_parse_article_body xml
255 let title' = Title $ fromJust parsed_article_title
256 let byline' = Byline parsed_article_byline
257 let body' = BodyHtml $ fromJust parsed_article_body
258 return $ Just $ Article title' byline' body'
259
260 parse_html_article :: String -> IO (Maybe Article)
261 parse_html_article html = do
262 let xml = parseHtml $ wrap_in_body_div html
263 fp_parse_article xml
264
265
266 -- | In the full page, all of the article titles and bodies are
267 -- wrapped in a div.ArticleText.
268 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
269 parse_bodies xml =
270 xml >>> css "div.ArticleText"
271
272
273 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
274 fp_parse_articles xml = do
275 bodies <- runX . xshow $ parse_bodies xml
276 let article_separator = "<h2 class=\"SummaryHL\">"
277 let split_articles'' = split article_separator (concat bodies)
278 -- The first element will contain the crap before the first <h2...>.
279 let split_articles' = tail split_articles''
280 -- Put the separator back, it was lost during the split.
281 let split_articles = map (article_separator ++) split_articles'
282 --_ <- mapM print_article split_articles
283 real_articles <- mapM parse_html_article split_articles
284 let just_articles = catMaybes real_articles
285 return just_articles
286
287
288 wrap_in_body_div :: String -> String
289 wrap_in_body_div s =
290 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
291
292
293
294
295 --
296 -- Epublishable stuff
297 --
298
299 title :: Page -> String
300 title (ArticlePage a) = getTitle $ LWN.Article.title a
301 title (FullPage hl _) = hl
302
303
304 metadata :: Page -> IO String
305 metadata obj = do
306 date <- getCurrentTime
307 return $
308 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
309 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
310 "<dc:language>en-US</dc:language>\n" ++
311 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
312 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
313
314
315 epublish :: Page -> Handle -> IO ()
316 epublish obj handle = do
317 let xhtml = to_xhtml obj
318 epmd <- metadata obj
319 epub <- xhtml_to_epub epmd xhtml
320 B.hPut handle epub
321
322
323 xhtml_to_epub :: String -> String -> IO B.ByteString
324 xhtml_to_epub epmd =
325 write_epub . read_html
326 where
327 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
328 write_epub = writeEPUB Nothing [] my_writer_options
329 read_html = readHtml defaultParserState