]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
24997154eeb84951188d804b6336647c8477a61f
[dead/lwn-epub.git] / src / LWN / Page.hs
1 module LWN.Page
2 where
3
4 import Text.Pandoc
5 import Data.Time (getCurrentTime)
6 import System.IO (Handle)
7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
8 import Data.String.Utils (split, strip)
9 import Data.Maybe (catMaybes, fromJust, isNothing)
10 import Data.Tree.NTree.TypeDefs (NTree)
11 import Test.HUnit (Assertion, assertEqual)
12 import Test.Framework (Test, testGroup)
13 import Test.Framework.Providers.HUnit (testCase)
14 import Text.XML.HXT.Core (
15 ArrowXml,
16 IOSArrow,
17 XmlTree,
18 XNode,
19 (>>>),
20 (/>),
21 (//>),
22 getChildren,
23 getText,
24 hasAttrValue,
25 hasName,
26 mkName,
27 none,
28 processAttrl,
29 processTopDown,
30 runX,
31 setElemName,
32 xshow,
33 when
34 )
35 import Text.HandsomeSoup (css, parseHtml)
36
37 import LWN.Article
38 import XHTML
39
40 data Page =
41 -- | An LWN page with one article on it.
42 ArticlePage { article :: Article } |
43
44 -- | An LWN page with more than one article on it. These require
45 -- different parsing and display functions than the single-article
46 -- pages.
47 FullPage { headline :: String,
48 articles :: [Article] }
49
50
51 instance XHTML Page where
52 to_xhtml (ArticlePage a) =
53 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
54 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
55 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
56 "<head>" ++
57 " <meta http-equiv=\"Content-Type\"" ++
58 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
59 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
60 "</head>" ++
61 "<body>" ++
62 "<div>" ++
63 (to_xhtml a) ++
64 "</div>" ++
65 "</body>" ++
66 "</html>"
67
68 to_xhtml (FullPage hl as) =
69 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
70 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
71 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
72 "<head>" ++
73 " <meta http-equiv=\"Content-Type\"" ++
74 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
75 " <title>" ++ hl ++ "</title>" ++
76 "</head>" ++
77 "<body>" ++
78 "<div>" ++
79 "<h1>" ++ hl ++ "</h1>" ++
80 (concatMap to_xhtml as) ++
81 "</div>" ++
82 "</body>" ++
83 "</html>"
84
85
86
87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
88 remove_images =
89 processTopDown ((none) `when` is_image)
90
91
92 is_link :: (ArrowXml a) => a XmlTree XmlTree
93 is_link =
94 hasName "a"
95
96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
97 replace_links_with_spans =
98 processTopDown $ (make_span >>> remove_attrs) `when` is_link
99 where
100 make_span = setElemName $ mkName "span"
101 remove_attrs = processAttrl none
102
103 -- | Preprocessing common to both page types.
104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
105 preprocess xml =
106 xml >>> remove_images >>> replace_links_with_spans
107
108
109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
110 parse xml = do
111 let clean_xml = preprocess xml
112 appr <- ap_parse clean_xml
113 fppr <- fp_parse clean_xml
114 return $
115 if (isNothing appr) then
116 fppr
117 else
118 appr
119
120 --
121 -- ArticlePage Stuff
122 --
123 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
124 ap_parse xml = do
125 arts <- ap_parse_articles xml
126 case arts of
127 Just [x] -> return $ Just $ ArticlePage x
128 _ -> return Nothing
129
130
131 ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
132 ap_parse_headline xml = do
133 let element_filter = xml >>> css "div.PageHeadline h1"
134 let element_text_filter = element_filter /> getText
135 element_text <- runX element_text_filter
136 return $ case element_text of
137 [x] -> Just $ strip x
138 [] -> Nothing
139 _ -> error "Found more than one headline."
140
141 ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
142 ap_parse_byline xml = do
143 let element_filter = xml >>> css "div.Byline"
144 let element_text_filter = element_filter /> getText
145 element_text <- runX element_text_filter
146 return $ case element_text of
147 [x] -> Just $ strip x
148 [] -> Nothing
149 _ -> error "Found more than one byline."
150
151
152 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
153 ap_parse_body xml = do
154 let element_filter = xml >>> css "div.ArticleText"
155 let element_html_filter = xshow element_filter
156 element_html <- runX element_html_filter
157 return $ case element_html of
158 [x] -> Just x
159 [] -> Nothing
160 _ -> error "Found more than one article."
161
162
163 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
164 ap_parse_articles xml = do
165 parsed_headline <- ap_parse_headline xml
166 parsed_byline <- ap_parse_byline xml
167 parsed_body <- ap_parse_body xml
168 let title' = Title (fromJust parsed_headline)
169 let byline' = Byline parsed_byline
170 let body' = BodyHtml (fromJust parsed_body)
171 return $ Just $ [Article title' byline' body']
172
173
174
175
176 --
177 -- FullPage Stuff
178 --
179
180 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
181 fp_parse xml = do
182 hl <- fp_parse_headline xml
183 parsed_articles <- fp_parse_articles xml
184 case parsed_articles of
185 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
186 _ -> return Nothing
187
188
189
190
191 fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
192 fp_parse_headline xml = do
193 let element_filter = xml >>> css "div.PageHeadline h1"
194 let element_text_filter = element_filter /> getText
195 element_text <- runX element_text_filter
196 return $ case element_text of
197 [x] -> Just $ strip x
198 [] -> Nothing
199 _ -> error "Found more than one headline."
200
201 fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
202 fp_parse_article_byline xml = do
203 let element_filter = xml >>> css "div.FeatureByLine"
204 let element_text_filter = element_filter /> getText
205 element_text <- runX element_text_filter
206 return $ case element_text of
207 [x] -> Just $ strip x
208 [] -> Nothing
209 _ -> error "Found more than one article byline."
210
211
212 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
213 fp_parse_article_title xml = do
214 let element_filter = xml >>> css "h2.SummaryHL"
215 let element_text_filter = element_filter //> getText
216 element_text <- runX element_text_filter
217 return $ case element_text of
218 [x] -> Just $ strip x
219 [] -> Nothing
220 _ -> error "Found more than one article title."
221
222
223
224 is_title :: (ArrowXml a) => a XmlTree XmlTree
225 is_title =
226 (hasName "h2")
227 >>>
228 (hasAttrValue "class" (== "SummaryHL"))
229
230
231 is_byline :: (ArrowXml a) => a XmlTree XmlTree
232 is_byline =
233 (hasName "div")
234 >>>
235 (hasAttrValue "class" (== "FeatureByLine"))
236
237
238 is_image :: (ArrowXml a) => a XmlTree XmlTree
239 is_image =
240 hasName "img"
241
242 remove_title :: (ArrowXml a) => a XmlTree XmlTree
243 remove_title =
244 processTopDown ((none) `when` is_title)
245
246
247 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
248 remove_byline =
249 processTopDown ((none) `when` is_byline)
250
251
252
253 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
254 fp_parse_article_body xml = do
255 -- First, delete the article title and byline.
256 let clean_xml' = xml >>> remove_title >>> remove_byline
257 -- The only child of the body element should be a div.lwn-article
258 -- since we wrapped the article's HTML in that.
259 let clean_xml = clean_xml' >>> css "body" >>> getChildren
260 clean_html <- runX . xshow $ clean_xml
261 return $ case clean_html of
262 [x] -> Just x
263 [] -> Nothing
264 _ -> error "Found more than one article body."
265
266 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
267 fp_parse_article xml = do
268 parsed_article_title <- fp_parse_article_title xml
269 parsed_article_byline <- fp_parse_article_byline xml
270 parsed_article_body <- fp_parse_article_body xml
271 let title' = Title $ fromJust parsed_article_title
272 let byline' = Byline parsed_article_byline
273 let body' = BodyHtml $ fromJust parsed_article_body
274 return $ Just $ Article title' byline' body'
275
276 parse_html_article :: String -> IO (Maybe Article)
277 parse_html_article html = do
278 let xml = parseHtml $ wrap_in_body_div html
279 fp_parse_article xml
280
281
282 -- | In the full page, all of the article titles and bodies are
283 -- wrapped in a div.ArticleText.
284 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
285 parse_bodies xml =
286 xml >>> css "div.ArticleText"
287
288
289 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
290 fp_parse_articles xml = do
291 bodies <- runX . xshow $ parse_bodies xml
292 let article_separator = "<h2 class=\"SummaryHL\">"
293 let split_articles'' = split article_separator (concat bodies)
294 -- The first element will contain the crap before the first <h2...>.
295 let split_articles' = tail split_articles''
296 -- Put the separator back, it was lost during the split.
297 let split_articles = map (article_separator ++) split_articles'
298 --_ <- mapM print_article split_articles
299 real_articles <- mapM parse_html_article split_articles
300 let just_articles = catMaybes real_articles
301 return just_articles
302
303
304 wrap_in_body_div :: String -> String
305 wrap_in_body_div s =
306 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
307
308
309
310
311 --
312 -- Epublishable stuff
313 --
314
315 title :: Page -> String
316 title (ArticlePage a) = getTitle $ LWN.Article.title a
317 title (FullPage hl _) = hl
318
319
320 metadata :: Page -> IO String
321 metadata obj = do
322 date <- getCurrentTime
323 return $
324 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
325 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
326 "<dc:language>en-US</dc:language>\n" ++
327 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
328 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
329
330
331 epublish :: Page -> Handle -> IO ()
332 epublish obj handle = do
333 let xhtml = to_xhtml obj
334 epmd <- metadata obj
335 epub <- xhtml_to_epub epmd xhtml
336 B.hPut handle epub
337
338
339 xhtml_to_epub :: String -> String -> IO B.ByteString
340 xhtml_to_epub epmd =
341 write_epub . read_html
342 where
343 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
344 write_epub = writeEPUB Nothing [] my_writer_options
345 read_html = readHtml defaultParserState
346
347
348
349
350 test_preprocess_links :: Assertion
351 test_preprocess_links = do
352 actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
353 let actual_xml = actual_xml' !! 0
354
355 expected_xml' <- runX $ expected_xml'' >>> css "body"
356 let expected_xml = expected_xml' !! 0
357
358 assertEqual
359 "Links are replaced with spans"
360 expected_xml
361 actual_xml
362 where
363 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
364 input_xml = parseHtml input_html
365 expected_html = "<body><span>Hello, world!</span></body>"
366 expected_xml'' = parseHtml expected_html
367
368 page_tests :: Test
369 page_tests =
370 testGroup "Page Tests" [
371 testCase "Links are replaced with spans" test_preprocess_links ]