Add config file parsing.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import Data.Maybe (catMaybes, fromJust, isNothing)
12 import Data.Tree.NTree.TypeDefs (NTree)
13 import Test.HUnit (Assertion, assertEqual)
14 import Test.Framework (Test, testGroup)
15 import Test.Framework.Providers.HUnit (testCase)
16 import Text.Pandoc
17 import Text.XML.HXT.Core (
18 ArrowXml,
19 IOSArrow,
20 XmlTree,
21 XNode,
22 (>>>),
23 (/>),
24 (//>),
25 changeAttrValue,
26 getAttrValue,
27 getChildren,
28 getText,
29 hasAttrValue,
30 hasName,
31 isElem,
32 mkName,
33 none,
34 processAttrl,
35 processTopDown,
36 runX,
37 setElemName,
38 xshow,
39 when
40 )
41 import Text.HandsomeSoup (css, parseHtml)
42
43 import LWN.Article
44 import LWN.HTTP (save_image)
45 import LWN.URI (URL, try_make_absolute_url)
46 import Misc (contains)
47 import XHTML
48
49 -- Map absolute image URLs to local system file paths where the image
50 -- referenced by the URL is stored.
51 type ImageMap = Map.Map URL FilePath
52
53 -- Should be called *after* preprocessing.
54 download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
55 download_images xml = do
56 image_urls <- runX $ xml >>> image_srcs
57 files <- mapM save_image image_urls
58 let pairs = zip image_urls files
59 return $ foldl my_insert empty_map pairs
60 where
61 empty_map = Map.empty :: ImageMap
62
63 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
64 my_insert dict (_, Nothing) = dict
65 my_insert dict (k, Just v) = Map.insert k v dict
66
67
68 data Page =
69 -- | An LWN page with one article on it.
70 ArticlePage { article :: Article } |
71
72 -- | An LWN page with more than one article on it. These require
73 -- different parsing and display functions than the single-article
74 -- pages.
75 FullPage { headline :: String,
76 articles :: [Article] }
77
78
79 instance XHTML Page where
80 to_xhtml (ArticlePage a) =
81 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
82 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
83 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
84 "<head>" ++
85 " <meta http-equiv=\"Content-Type\"" ++
86 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
87 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
88 "</head>" ++
89 "<body>" ++
90 "<div>" ++
91 (to_xhtml a) ++
92 "</div>" ++
93 "</body>" ++
94 "</html>"
95
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
100 "<head>" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
104 "</head>" ++
105 "<body>" ++
106 "<div>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
109 "</div>" ++
110 "</body>" ++
111 "</html>"
112
113
114
115 is_link :: (ArrowXml a) => a XmlTree XmlTree
116 is_link =
117 isElem >>> hasName "a"
118
119
120 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
121 remove_comment_links =
122 processTopDown $ kill_comments `when` is_link
123 where
124 is_comment_link =
125 hasAttrValue "href" (contains "#Comments")
126
127 kill_comments =
128 none `when` is_comment_link
129
130 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
131 replace_links_with_spans =
132 processTopDown $ (make_span >>> remove_attrs) `when` is_link
133 where
134 make_span = setElemName $ mkName "span"
135 remove_attrs = processAttrl none
136
137
138 -- | Preprocessing common to both page types.
139 preprocess :: (ArrowXml a) => a XmlTree XmlTree
140 preprocess =
141 make_image_srcs_absolute
142 >>>
143 remove_comment_links
144 >>>
145 replace_links_with_spans
146
147
148 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
149 replace_remote_img_srcs image_map =
150 processTopDown (make_srcs_local `when` is_image)
151 where
152 -- old_src -> new_src
153 change_src_func :: String -> String
154 change_src_func old_src =
155 case Map.lookup old_src image_map of
156 -- Leave it alone if we don't have the file locally
157 Nothing -> old_src
158 Just v -> v
159
160 change_src :: (ArrowXml a) => a XmlTree XmlTree
161 change_src =
162 changeAttrValue change_src_func
163
164 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
165 make_srcs_local =
166 processAttrl $ (change_src `when` (hasName "src"))
167
168
169 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
170 parse xml = do
171 let clean_xml = xml >>> preprocess
172 image_map <- download_images clean_xml
173 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
174 appr <- ap_parse local_xml
175 fppr <- fp_parse local_xml
176 return $
177 if (isNothing appr) then
178 fppr
179 else
180 appr
181
182
183
184 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
185 parse_headline xml = do
186 let element_filter = xml >>> css "div.PageHeadline h1"
187 let element_text_filter = element_filter /> getText
188 element_text <- runX element_text_filter
189 return $
190 case element_text of
191 [x] -> Just $ strip x
192 [] -> Nothing
193 _ -> error "Found more than one headline."
194
195
196 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
197 parse_byline xml = do
198 let element_filter = xml >>> css "div.FeatureByLine"
199 let element_text_filter = element_filter /> getText
200 element_text <- runX element_text_filter
201 return $
202 case element_text of
203 [x] -> Just $ strip x
204 [] -> Nothing
205 _ -> error "Found more than one article byline."
206
207
208 --
209 -- ArticlePage Stuff
210 --
211 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
212 ap_parse xml = do
213 arts <- ap_parse_articles xml
214 case arts of
215 [x] -> return $ Just $ ArticlePage x
216 _ -> return Nothing
217
218
219 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
220 ap_parse_body xml = do
221 let element_filter = xml >>> css "div.ArticleText"
222 let element_html_filter = xshow element_filter
223 element_html <- runX element_html_filter
224 return $ case element_html of
225 [x] -> Just x
226 [] -> Nothing
227 _ -> error "Found more than one article."
228
229
230 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
231 ap_parse_articles xml = do
232 parsed_headline <- parse_headline xml
233 parsed_byline <- parse_byline xml
234 parsed_body <- ap_parse_body xml
235
236 putStrLn $ fromJust parsed_headline
237
238 if (isNothing parsed_headline) || (isNothing parsed_body)
239 then return []
240 else do
241 let title' = Title $ fromJust parsed_headline
242 let byline' = Byline parsed_byline
243 let body' = BodyHtml $ fromJust parsed_body
244
245 return $ [Article title' byline' body']
246
247
248
249 --
250 -- FullPage Stuff
251 --
252
253 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
254 fp_parse xml = do
255 hl <- parse_headline xml
256 parsed_articles <- fp_parse_articles xml
257 case parsed_articles of
258 [] -> return Nothing
259 x -> return $ Just $ FullPage (fromJust hl) x
260
261
262
263 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
264 fp_parse_article_title xml = do
265 let element_filter = xml >>> css "h2.SummaryHL"
266 let element_text_filter = element_filter //> getText
267 element_text <- runX element_text_filter
268 return $ case element_text of
269 [x] -> Just $ strip x
270 [] -> Nothing
271 _ -> error "Found more than one article title."
272
273
274
275 is_title :: (ArrowXml a) => a XmlTree XmlTree
276 is_title =
277 (hasName "h2")
278 >>>
279 (hasAttrValue "class" (== "SummaryHL"))
280
281
282 is_byline :: (ArrowXml a) => a XmlTree XmlTree
283 is_byline =
284 (hasName "div")
285 >>>
286 (hasAttrValue "class" (== "FeatureByLine"))
287
288
289 is_image :: (ArrowXml a) => a XmlTree XmlTree
290 is_image = isElem >>> hasName "img"
291
292 remove_title :: (ArrowXml a) => a XmlTree XmlTree
293 remove_title =
294 processTopDown ((none) `when` is_title)
295
296
297 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
298 remove_byline =
299 processTopDown ((none) `when` is_byline)
300
301
302
303 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
304 fp_parse_article_body xml = do
305 -- First, delete the article title and byline.
306 let clean_xml' = xml >>> remove_title >>> remove_byline
307 -- The only child of the body element should be a div.lwn-article
308 -- since we wrapped the article's HTML in that.
309 let clean_xml = clean_xml' >>> css "body" >>> getChildren
310 clean_html <- runX . xshow $ clean_xml
311 return $ case clean_html of
312 [x] -> Just x
313 [] -> Nothing
314 _ -> error "Found more than one article body."
315
316 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
317 fp_parse_article xml = do
318 parsed_article_title <- fp_parse_article_title xml
319 parsed_article_byline <- parse_byline xml
320 parsed_article_body <- fp_parse_article_body xml
321
322 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
323 then
324 return Nothing
325 else do
326 let title' = Title $ fromJust parsed_article_title
327 let byline' = Byline parsed_article_byline
328 let body' = BodyHtml $ fromJust parsed_article_body
329 return $ Just $ Article title' byline' body'
330
331 parse_html_article :: String -> IO (Maybe Article)
332 parse_html_article html = do
333 let xml = parseHtml $ wrap_in_body_div html
334 fp_parse_article xml
335
336
337 -- | In the full page, all of the article titles and bodies are
338 -- wrapped in one big div.ArticleText.
339 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
340 parse_bodies xml =
341 xml >>> css "div.ArticleText"
342
343
344 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
345 fp_parse_articles xml = do
346 bodies <- runX . xshow $ parse_bodies xml
347 let article_separator = "<h2 class=\"SummaryHL\">"
348 let split_articles'' = split article_separator (concat bodies)
349 -- The first element will contain the crap before the first <h2...>.
350 let split_articles' = case split_articles'' of
351 (_:_) -> tail split_articles''
352 [] -> []
353 -- Put the separator back, it was lost during the split.
354 let split_articles = map (article_separator ++) split_articles'
355 --_ <- mapM print_article split_articles
356 real_articles <- mapM parse_html_article split_articles
357 let just_articles = catMaybes real_articles
358 return just_articles
359
360
361 -- | This makes it easy to select otherwise-random chunks of html
362 -- using 'css'.
363 wrap_in_body_div :: String -> String
364 wrap_in_body_div s =
365 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
366
367
368
369
370 --
371 -- Epublishable stuff
372 --
373
374 title :: Page -> String
375 title (ArticlePage a) = getTitle $ LWN.Article.title a
376 title (FullPage hl _) = hl
377
378
379 metadata :: Page -> IO String
380 metadata obj = do
381 date <- getCurrentTime
382 return $
383 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
384 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
385 "<dc:language>en-US</dc:language>\n" ++
386 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
387 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
388
389
390 epublish :: Page -> Handle -> IO ()
391 epublish obj handle = do
392 let xhtml = to_xhtml obj
393 epmd <- metadata obj
394 epub <- xhtml_to_epub epmd xhtml
395 B.hPut handle epub
396
397
398 xhtml_to_epub :: String -> String -> IO B.ByteString
399 xhtml_to_epub epmd =
400 write_epub . read_html
401 where
402 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
403 write_epub = writeEPUB Nothing [] my_writer_options
404 read_html = readHtml defaultParserState
405
406
407
408 --
409 -- Misc
410 --
411
412 image_srcs :: (ArrowXml a) => a XmlTree URL
413 image_srcs =
414 css "img"
415 >>>
416 getAttrValue "src"
417
418 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
419 make_image_srcs_absolute =
420 processTopDown (make_srcs_absolute `when` is_image)
421 where
422 change_src :: (ArrowXml a) => a XmlTree XmlTree
423 change_src =
424 changeAttrValue try_make_absolute_url
425
426 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
427 make_srcs_absolute =
428 processAttrl $ change_src `when` hasName "src"
429
430
431 --
432 -- Tests
433 --
434
435 test_preprocess_links :: Assertion
436 test_preprocess_links = do
437 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
438 let actual_xml = actual_xml' !! 0
439
440 expected_xml' <- runX $ expected_xml'' >>> css "body"
441 let expected_xml = expected_xml' !! 0
442
443 assertEqual
444 "Links are replaced with spans"
445 expected_xml
446 actual_xml
447 where
448 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
449 input_xml = parseHtml input_html
450 expected_html = "<body><span>Hello, world!</span></body>"
451 expected_xml'' = parseHtml expected_html
452
453
454 test_absolve_images :: Assertion
455 test_absolve_images = do
456 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
457 let actual_xml = actual_xml' !! 0
458
459 expected_xml' <- runX $ expected_xml'' >>> css "body"
460 let expected_xml = expected_xml' !! 0
461
462 assertEqual
463 "Image srcs are made absolute"
464 expected_xml
465 actual_xml
466 where
467 input_html =
468 "<body>" ++
469 "<img src=\"/images/2012/example.jpg\" />" ++
470 "</body>"
471 input_xml = parseHtml input_html
472 expected_html =
473 "<body>" ++
474 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
475 "</body>"
476 expected_xml'' = parseHtml expected_html
477
478
479 test_comments_removed :: Assertion
480 test_comments_removed = do
481 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
482 let actual_xml = actual_xml' !! 0
483
484 expected_xml' <- runX $ expected_xml'' >>> css "body"
485 let expected_xml = expected_xml' !! 0
486
487 assertEqual
488 "Comment links are removed"
489 expected_xml
490 actual_xml
491 where
492 input_html =
493 "<body><p>" ++
494 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
495 "</p></body>"
496 input_xml = parseHtml input_html
497
498 expected_html = "<body><p></p></body>"
499 expected_xml'' = parseHtml expected_html
500
501
502
503 page_tests :: Test
504 page_tests =
505 testGroup "Page Tests" [
506 testCase "Links are replaced with spans" test_preprocess_links,
507 testCase "Image srcs are made absolute" test_absolve_images,
508 testCase "Comment links are removed" test_comments_removed ]