Source reorganization and cleanup.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle, hClose, hFlush)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import Data.Maybe (catMaybes, fromJust, isNothing)
12 import Test.HUnit (Assertion, assertEqual)
13 import Test.Framework (Test, testGroup)
14 import Test.Framework.Providers.HUnit (testCase)
15 import Text.Pandoc (
16 defaultParserState,
17 defaultWriterOptions,
18 readHtml,
19 writeEPUB,
20 writerEPUBMetadata)
21 import Text.XML.HXT.Core (
22 ArrowXml,
23 IOSArrow,
24 XmlTree,
25 (>>>),
26 (/>),
27 (//>),
28 changeAttrValue,
29 getAttrValue,
30 getChildren,
31 getText,
32 hasAttrValue,
33 hasName,
34 isElem,
35 mkName,
36 none,
37 processAttrl,
38 processTopDown,
39 runX,
40 setElemName,
41 xshow,
42 when)
43 import Text.HandsomeSoup (css, parseHtml)
44
45 import LWN.Article
46 import LWN.HTTP (ImageMap, download_image_urls)
47 import LWN.URI (URL, try_make_absolute_url)
48 import LWN.XHTML (XHTML, to_xhtml)
49 import Misc (contains)
50
51 -- Should be called *after* preprocessing.
52 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
53 download_images xml = do
54 image_urls <- runX $ xml >>> image_srcs
55 download_image_urls image_urls
56
57
58 data Page =
59 -- | An LWN page with one article on it.
60 ArticlePage { article :: Article } |
61
62 -- | An LWN page with more than one article on it. These require
63 -- different parsing and display functions than the single-article
64 -- pages.
65 FullPage { headline :: String,
66 articles :: [Article] }
67
68
69 instance XHTML Page where
70 to_xhtml (ArticlePage a) =
71 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
72 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
73 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
74 "<head>" ++
75 " <meta http-equiv=\"Content-Type\"" ++
76 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
77 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
78 "</head>" ++
79 "<body>" ++
80 "<div>" ++
81 (to_xhtml a) ++
82 "</div>" ++
83 "</body>" ++
84 "</html>"
85
86 to_xhtml (FullPage hl as) =
87 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
88 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
89 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
90 "<head>" ++
91 " <meta http-equiv=\"Content-Type\"" ++
92 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
93 " <title>" ++ hl ++ "</title>" ++
94 "</head>" ++
95 "<body>" ++
96 "<div>" ++
97 "<h1>" ++ hl ++ "</h1>" ++
98 (concatMap to_xhtml as) ++
99 "</div>" ++
100 "</body>" ++
101 "</html>"
102
103
104
105 is_link :: (ArrowXml a) => a XmlTree XmlTree
106 is_link =
107 isElem >>> hasName "a"
108
109
110 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
111 remove_comment_links =
112 processTopDown $ kill_comments `when` is_link
113 where
114 is_comment_link =
115 hasAttrValue "href" (contains "#Comments")
116
117 kill_comments =
118 none `when` is_comment_link
119
120 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
121 replace_links_with_spans =
122 processTopDown $ (make_span >>> remove_attrs) `when` is_link
123 where
124 make_span = setElemName $ mkName "span"
125 remove_attrs = processAttrl none
126
127
128 -- | Preprocessing common to both page types.
129 preprocess :: (ArrowXml a) => a XmlTree XmlTree
130 preprocess =
131 make_image_srcs_absolute
132 >>>
133 remove_comment_links
134 >>>
135 replace_links_with_spans
136
137
138 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
139 replace_remote_img_srcs image_map =
140 processTopDown (make_srcs_local `when` is_image)
141 where
142 -- old_src -> new_src
143 change_src_func :: String -> String
144 change_src_func old_src =
145 case Map.lookup old_src image_map of
146 -- Leave it alone if we don't have the file locally
147 Nothing -> old_src
148 Just v -> v
149
150 change_src :: (ArrowXml a) => a XmlTree XmlTree
151 change_src =
152 changeAttrValue change_src_func
153
154 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
155 make_srcs_local =
156 processAttrl $ (change_src `when` (hasName "src"))
157
158
159 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
160 parse xml = do
161 let clean_xml = xml >>> preprocess
162 image_map <- download_images clean_xml
163 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
164 appr <- ap_parse local_xml
165 fppr <- fp_parse local_xml
166 return $
167 if (isNothing appr) then
168 fppr
169 else
170 appr
171
172
173
174 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
175 parse_headline xml = do
176 let element_filter = xml >>> css "div.PageHeadline h1"
177 let element_text_filter = element_filter /> getText
178 element_text <- runX element_text_filter
179 return $
180 case element_text of
181 [x] -> Just $ strip x
182 [] -> Nothing
183 _ -> error "Found more than one headline."
184
185
186 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
187 parse_byline xml = do
188 let element_filter = xml >>> css "div.FeatureByLine"
189 let element_text_filter = element_filter /> getText
190 element_text <- runX element_text_filter
191 return $
192 case element_text of
193 [x] -> Just $ strip x
194 [] -> Nothing
195 _ -> error "Found more than one article byline."
196
197
198 --
199 -- ArticlePage Stuff
200 --
201 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
202 ap_parse xml = do
203 arts <- ap_parse_articles xml
204 case arts of
205 [x] -> return $ Just $ ArticlePage x
206 _ -> return Nothing
207
208
209 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
210 ap_parse_body xml = do
211 let element_filter = xml >>> css "div.ArticleText"
212 let element_html_filter = xshow element_filter
213 element_html <- runX element_html_filter
214 return $ case element_html of
215 [x] -> Just x
216 [] -> Nothing
217 _ -> error "Found more than one article."
218
219
220 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
221 ap_parse_articles xml = do
222 parsed_headline <- parse_headline xml
223 parsed_byline <- parse_byline xml
224 parsed_body <- ap_parse_body xml
225
226 putStrLn $ fromJust parsed_headline
227
228 if (isNothing parsed_headline) || (isNothing parsed_body)
229 then return []
230 else do
231 let title' = Title $ fromJust parsed_headline
232 let byline' = Byline parsed_byline
233 let body' = BodyHtml $ fromJust parsed_body
234
235 return $ [Article title' byline' body']
236
237
238
239 --
240 -- FullPage Stuff
241 --
242
243 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
244 fp_parse xml = do
245 hl <- parse_headline xml
246 parsed_articles <- fp_parse_articles xml
247 case parsed_articles of
248 [] -> return Nothing
249 x -> return $ Just $ FullPage (fromJust hl) x
250
251
252
253 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
254 fp_parse_article_title xml = do
255 let element_filter = xml >>> css "h2.SummaryHL"
256 let element_text_filter = element_filter //> getText
257 element_text <- runX element_text_filter
258 return $ case element_text of
259 [x] -> Just $ strip x
260 [] -> Nothing
261 _ -> error "Found more than one article title."
262
263
264
265 is_title :: (ArrowXml a) => a XmlTree XmlTree
266 is_title =
267 (hasName "h2")
268 >>>
269 (hasAttrValue "class" (== "SummaryHL"))
270
271
272 is_byline :: (ArrowXml a) => a XmlTree XmlTree
273 is_byline =
274 (hasName "div")
275 >>>
276 (hasAttrValue "class" (== "FeatureByLine"))
277
278
279 is_image :: (ArrowXml a) => a XmlTree XmlTree
280 is_image = isElem >>> hasName "img"
281
282 remove_title :: (ArrowXml a) => a XmlTree XmlTree
283 remove_title =
284 processTopDown ((none) `when` is_title)
285
286
287 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
288 remove_byline =
289 processTopDown ((none) `when` is_byline)
290
291
292
293 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
294 fp_parse_article_body xml = do
295 -- First, delete the article title and byline.
296 let clean_xml' = xml >>> remove_title >>> remove_byline
297 -- The only child of the body element should be a div.lwn-article
298 -- since we wrapped the article's HTML in that.
299 let clean_xml = clean_xml' >>> css "body" >>> getChildren
300 clean_html <- runX . xshow $ clean_xml
301 return $ case clean_html of
302 [x] -> Just x
303 [] -> Nothing
304 _ -> error "Found more than one article body."
305
306 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
307 fp_parse_article xml = do
308 parsed_article_title <- fp_parse_article_title xml
309 parsed_article_byline <- parse_byline xml
310 parsed_article_body <- fp_parse_article_body xml
311
312 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
313 then
314 return Nothing
315 else do
316 let title' = Title $ fromJust parsed_article_title
317 let byline' = Byline parsed_article_byline
318 let body' = BodyHtml $ fromJust parsed_article_body
319 return $ Just $ Article title' byline' body'
320
321 parse_html_article :: String -> IO (Maybe Article)
322 parse_html_article html = do
323 let xml = parseHtml $ wrap_in_body_div html
324 fp_parse_article xml
325
326
327 -- | In the full page, all of the article titles and bodies are
328 -- wrapped in one big div.ArticleText.
329 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
330 parse_bodies xml =
331 xml >>> css "div.ArticleText"
332
333
334 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
335 fp_parse_articles xml = do
336 bodies <- runX . xshow $ parse_bodies xml
337 let article_separator = "<h2 class=\"SummaryHL\">"
338 let split_articles'' = split article_separator (concat bodies)
339 -- The first element will contain the crap before the first <h2...>.
340 let split_articles' = case split_articles'' of
341 (_:_) -> tail split_articles''
342 [] -> []
343 -- Put the separator back, it was lost during the split.
344 let split_articles = map (article_separator ++) split_articles'
345 --_ <- mapM print_article split_articles
346 real_articles <- mapM parse_html_article split_articles
347 let just_articles = catMaybes real_articles
348 return just_articles
349
350
351 -- | This makes it easy to select otherwise-random chunks of html
352 -- using 'css'.
353 wrap_in_body_div :: String -> String
354 wrap_in_body_div s =
355 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
356
357
358
359
360 --
361 -- Epublishable stuff
362 --
363
364 title :: Page -> String
365 title (ArticlePage a) = getTitle $ LWN.Article.title a
366 title (FullPage hl _) = hl
367
368
369 metadata :: Page -> IO String
370 metadata obj = do
371 date <- getCurrentTime
372 return $
373 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
374 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
375 "<dc:language>en-US</dc:language>\n" ++
376 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
377 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
378
379
380 epublish :: Page -> Handle -> IO ()
381 epublish obj handle = do
382 let xhtml = to_xhtml obj
383 epmd <- metadata obj
384 epub <- xhtml_to_epub epmd xhtml
385 B.hPut handle epub
386 hFlush handle
387 hClose handle
388
389 xhtml_to_epub :: String -> String -> IO B.ByteString
390 xhtml_to_epub epmd =
391 write_epub . read_html
392 where
393 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
394 write_epub = writeEPUB Nothing [] my_writer_options
395 read_html = readHtml defaultParserState
396
397
398
399 --
400 -- Misc
401 --
402
403 image_srcs :: (ArrowXml a) => a XmlTree URL
404 image_srcs =
405 css "img"
406 >>>
407 getAttrValue "src"
408
409 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
410 make_image_srcs_absolute =
411 processTopDown (make_srcs_absolute `when` is_image)
412 where
413 change_src :: (ArrowXml a) => a XmlTree XmlTree
414 change_src =
415 changeAttrValue try_make_absolute_url
416
417 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
418 make_srcs_absolute =
419 processAttrl $ change_src `when` hasName "src"
420
421
422 --
423 -- Tests
424 --
425
426 test_preprocess_links :: Assertion
427 test_preprocess_links = do
428 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
429 let actual_xml = actual_xml' !! 0
430
431 expected_xml' <- runX $ expected_xml'' >>> css "body"
432 let expected_xml = expected_xml' !! 0
433
434 assertEqual
435 "Links are replaced with spans"
436 expected_xml
437 actual_xml
438 where
439 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
440 input_xml = parseHtml input_html
441 expected_html = "<body><span>Hello, world!</span></body>"
442 expected_xml'' = parseHtml expected_html
443
444
445 test_absolve_images :: Assertion
446 test_absolve_images = do
447 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
448 let actual_xml = actual_xml' !! 0
449
450 expected_xml' <- runX $ expected_xml'' >>> css "body"
451 let expected_xml = expected_xml' !! 0
452
453 assertEqual
454 "Image srcs are made absolute"
455 expected_xml
456 actual_xml
457 where
458 input_html =
459 "<body>" ++
460 "<img src=\"/images/2012/example.jpg\" />" ++
461 "</body>"
462 input_xml = parseHtml input_html
463 expected_html =
464 "<body>" ++
465 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
466 "</body>"
467 expected_xml'' = parseHtml expected_html
468
469
470 test_comments_removed :: Assertion
471 test_comments_removed = do
472 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
473 let actual_xml = actual_xml' !! 0
474
475 expected_xml' <- runX $ expected_xml'' >>> css "body"
476 let expected_xml = expected_xml' !! 0
477
478 assertEqual
479 "Comment links are removed"
480 expected_xml
481 actual_xml
482 where
483 input_html =
484 "<body><p>" ++
485 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
486 "</p></body>"
487 input_xml = parseHtml input_html
488
489 expected_html = "<body><p></p></body>"
490 expected_xml'' = parseHtml expected_html
491
492
493
494 page_tests :: Test
495 page_tests =
496 testGroup "Page Tests" [
497 testCase "Links are replaced with spans" test_preprocess_links,
498 testCase "Image srcs are made absolute" test_absolve_images,
499 testCase "Comment links are removed" test_comments_removed ]