Return-style fix.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata,
24 writerUserDataDir)
25 import Text.Pandoc.Shared ( readDataFile )
26 import Text.XML.HXT.Core (
27 ArrowXml,
28 IOSArrow,
29 XmlTree,
30 ($<),
31 (>>>),
32 (/>),
33 (//>),
34 changeAttrValue,
35 getAttrValue,
36 getChildren,
37 getText,
38 hasName,
39 none,
40 processAttrl,
41 processTopDown,
42 this,
43 runX,
44 xshow,
45 when)
46 import Text.HandsomeSoup (css, parseHtml)
47
48 import Configuration (Cfg, full_stories)
49 import LWN.Article
50 import LWN.HTTP (
51 ImageMap,
52 download_image_urls,
53 get_article_contents)
54 import LWN.URI (URL)
55 import LWN.XHTML (
56 XHTML,
57 full_story_urls,
58 image_srcs,
59 full_story_link,
60 full_story_paragraph,
61 is_image,
62 preprocess,
63 remove_byline,
64 remove_full_story_paragraphs,
65 remove_title,
66 to_xhtml,
67 to_xml,
68 xml_from_contents)
69
70
71
72
73 data Page =
74 -- | An LWN page with one article on it.
75 ArticlePage { article :: Article } |
76
77 -- | An LWN page with more than one article on it. These require
78 -- different parsing and display functions than the single-article
79 -- pages.
80 FullPage { headline :: String,
81 articles :: [Article] }
82
83
84 instance XHTML Page where
85 to_xhtml (ArticlePage a) =
86 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
87 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
88 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
89 "<head>" ++
90 " <meta http-equiv=\"Content-Type\"" ++
91 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
92 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
93 "</head>" ++
94 "<body>" ++
95 (to_xhtml a) ++
96 "</body>" ++
97 "</html>"
98
99 to_xhtml (FullPage hl as) =
100 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
101 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
102 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
103 "<head>" ++
104 " <meta http-equiv=\"Content-Type\"" ++
105 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
106 " <title>" ++ hl ++ "</title>" ++
107 "</head>" ++
108 "<body>" ++
109 "<div>" ++
110 "<h1>" ++ hl ++ "</h1>" ++
111 (concatMap to_xhtml as) ++
112 "</div>" ++
113 "</body>" ++
114 "</html>"
115
116
117
118 -- | Stolen from writeEPUB.
119 default_stylesheet :: IO String
120 default_stylesheet =
121 -- This comes with Pandoc, I guess.
122 readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
123
124
125 construct_stylesheet :: IO String
126 construct_stylesheet = do
127 defaults <- default_stylesheet
128 -- Allow word-wrapping in <pre> elements.
129 let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
130 return $ defaults ++ my_additions
131
132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
133 page_from_url cfg url = do
134 contents <- get_article_contents cfg url
135 case (xml_from_contents contents) of
136 Just html -> parse cfg html
137 Nothing -> return Nothing
138
139
140
141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
142 insert_full_stories story_map =
143 processTopDown (article_xml `when` full_story_paragraph)
144 where
145 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
146 lookup_func href =
147 case Map.lookup href story_map of
148 -- Drop the paragraph if we don't have the contents.
149 Nothing -> none
150 Just v -> to_xml v
151
152 article_xml :: (ArrowXml a) => a XmlTree XmlTree
153 article_xml =
154 lookup_func
155 $< -- From HXT's Control.Arrow.ArrowList
156 (this /> full_story_link >>> getAttrValue "href")
157
158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
159 replace_remote_img_srcs image_map =
160 processTopDown (make_srcs_local `when` is_image)
161 where
162 -- old_src -> new_src
163 change_src_func :: String -> String
164 change_src_func old_src =
165 case Map.lookup old_src image_map of
166 -- Leave it alone if we don't have the file locally
167 Nothing -> old_src
168 Just v -> v
169
170 change_src :: (ArrowXml a) => a XmlTree XmlTree
171 change_src =
172 changeAttrValue change_src_func
173
174 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
175 make_srcs_local =
176 processAttrl $ (change_src `when` (hasName "src"))
177
178
179
180
181 -- Should be called *after* preprocessing.
182 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
183 download_images xml = do
184 image_urls <- runX $ xml >>> image_srcs
185 download_image_urls image_urls
186
187
188
189 type StoryMap = Map.Map URL Article
190
191 -- These come *before* preprocessing.
192 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
193 download_full_story_urls cfg story_urls = do
194 pages <- parallel $ map (page_from_url cfg) story_urls
195 let pairs = zip story_urls pages
196 return $ foldl my_insert empty_map pairs
197 where
198 empty_map = Map.empty :: StoryMap
199
200 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
201 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
202 my_insert dict (_, _) = dict
203
204
205 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
206 download_full_stories cfg xml = do
207 story_urls <- runX $ xml >>> full_story_urls
208 download_full_story_urls cfg story_urls
209
210
211 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
212 parse cfg xml = do
213 fs_xml <- if (full_stories cfg) then do
214 story_map <- download_full_stories cfg xml
215 return $ xml >>> insert_full_stories story_map
216 else do
217 -- Get rid of them if we don't want them.
218 return $ xml >>> remove_full_story_paragraphs
219
220 let clean_xml = fs_xml >>> preprocess
221 image_map <- download_images clean_xml
222 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
223
224 appr <- ap_parse local_xml
225 fppr <- fp_parse local_xml
226 return $
227 if (isNothing appr) then
228 fppr
229 else
230 appr
231
232
233
234 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
235 parse_headline xml = do
236 let element_filter = xml >>> css "div.PageHeadline h1"
237 let element_text_filter = element_filter /> getText
238 element_text <- runX element_text_filter
239 return $
240 case element_text of
241 [x] -> Just $ strip x
242 [] -> Nothing
243 _ -> error "Found more than one headline."
244
245
246 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
247 parse_byline xml = do
248 let element_filter = xml >>> css "div.FeatureByLine"
249 let element_text_filter = element_filter /> getText
250 element_text <- runX element_text_filter
251 return $
252 case element_text of
253 [x] -> Just $ strip x
254 [] -> Nothing
255 _ -> error "Found more than one article byline."
256
257
258 --
259 -- ArticlePage Stuff
260 --
261 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
262 ap_parse xml = do
263 arts <- ap_parse_articles xml
264 case arts of
265 [x] -> return $ Just $ ArticlePage x
266 _ -> return Nothing
267
268
269 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
270 ap_parse_body xml = do
271 let element_filter = xml >>> css "div.ArticleText"
272 let element_html_filter = xshow element_filter
273 element_html <- runX element_html_filter
274 return $ case element_html of
275 [x] -> Just x
276 [] -> Nothing
277 _ -> error "Found more than one article."
278
279
280 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
281 ap_parse_articles xml = do
282 parsed_headline <- parse_headline xml
283 parsed_byline <- parse_byline xml
284 parsed_body <- ap_parse_body xml
285
286 if (isNothing parsed_headline) || (isNothing parsed_body)
287 then return []
288 else do
289 let title' = Title $ fromJust parsed_headline
290 let byline' = Byline parsed_byline
291 let body' = BodyHtml $ fromJust parsed_body
292
293 return $ [Article title' byline' body']
294
295
296
297 --
298 -- FullPage Stuff
299 --
300
301 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
302 fp_parse xml = do
303 hl <- parse_headline xml
304 parsed_articles <- fp_parse_articles xml
305 return $ case parsed_articles of
306 [] -> Nothing
307 x -> Just $ FullPage (fromJust hl) x
308
309
310
311 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
312 fp_parse_article_title xml = do
313 let element_filter = xml >>> css "h2.SummaryHL"
314 let element_text_filter = element_filter //> getText
315 element_text <- runX element_text_filter
316 return $ case element_text of
317 [x] -> Just $ strip x
318 [] -> Nothing
319 _ -> error "Found more than one article title."
320
321
322
323
324 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
325 fp_parse_article_body xml = do
326 -- First, delete the article title and byline.
327 let clean_xml' = xml >>> remove_title >>> remove_byline
328 -- The only child of the body element should be a div.lwn-article
329 -- since we wrapped the article's HTML in that.
330 let clean_xml = clean_xml' >>> css "body" >>> getChildren
331 clean_html <- runX . xshow $ clean_xml
332 return $ case clean_html of
333 [x] -> Just x
334 [] -> Nothing
335 _ -> error "Found more than one article body."
336
337 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
338 fp_parse_article xml = do
339 parsed_article_title <- fp_parse_article_title xml
340 parsed_article_byline <- parse_byline xml
341 parsed_article_body <- fp_parse_article_body xml
342
343 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
344 then
345 return Nothing
346 else do
347 let title' = Title $ fromJust parsed_article_title
348 let byline' = Byline parsed_article_byline
349 let body' = BodyHtml $ fromJust parsed_article_body
350 return $ Just $ Article title' byline' body'
351
352 parse_html_article :: String -> IO (Maybe Article)
353 parse_html_article html = do
354 let xml = parseHtml $ wrap_in_body_div html
355 fp_parse_article xml
356
357
358 -- | In the full page, all of the article titles and bodies are
359 -- wrapped in one big div.ArticleText.
360 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
361 parse_bodies xml =
362 xml >>> css "div.ArticleText"
363
364
365 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
366 fp_parse_articles xml = do
367 bodies <- runX . xshow $ parse_bodies xml
368 let article_separator = "<h2 class=\"SummaryHL\">"
369 let split_articles'' = split article_separator (concat bodies)
370 -- The first element will contain the crap before the first <h2...>.
371 let split_articles' = case split_articles'' of
372 (_:_) -> tail split_articles''
373 [] -> []
374 -- Put the separator back, it was lost during the split.
375 let split_articles = map (article_separator ++) split_articles'
376 real_articles <- mapM parse_html_article split_articles
377 let just_articles = catMaybes real_articles
378 return just_articles
379
380
381 -- | This makes it easy to select otherwise-random chunks of html
382 -- using 'css'.
383 wrap_in_body_div :: String -> String
384 wrap_in_body_div s =
385 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
386
387
388
389
390 --
391 -- Epublishable stuff
392 --
393
394 title :: Page -> String
395 title (ArticlePage a) = getTitle $ LWN.Article.title a
396 title (FullPage hl _) = hl
397
398
399 metadata :: Page -> IO String
400 metadata obj = do
401 date <- getCurrentTime
402 return $
403 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
404 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
405 "<dc:language>en-US</dc:language>\n" ++
406 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
407 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
408
409
410 epublish :: Page -> Handle -> IO ()
411 epublish obj handle = do
412 let xhtml = to_xhtml obj
413 epmd <- metadata obj
414 epub <- xhtml_to_epub epmd xhtml
415 B.hPut handle epub
416 hFlush handle
417 hClose handle
418
419 xhtml_to_epub :: String -> String -> IO B.ByteString
420 xhtml_to_epub epmd xhtml = do
421 stylesheet <- construct_stylesheet
422 writeEPUB
423 (Just stylesheet)
424 []
425 my_writer_options
426 (read_html xhtml)
427 where
428 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
429 read_html = readHtml defaultParserState
430
431
432 --
433 -- Tests
434 --
435
436 test_preprocess_links :: Assertion
437 test_preprocess_links = do
438 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
439 let actual_xml = actual_xml' !! 0
440
441 expected_xml' <- runX $ expected_xml'' >>> css "body"
442 let expected_xml = expected_xml' !! 0
443
444 assertEqual
445 "Links are replaced with spans"
446 expected_xml
447 actual_xml
448 where
449 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
450 input_xml = parseHtml input_html
451 expected_html = "<body><span>Hello, world!</span></body>"
452 expected_xml'' = parseHtml expected_html
453
454
455 test_absolve_images :: Assertion
456 test_absolve_images = do
457 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
458 let actual_xml = actual_xml' !! 0
459
460 expected_xml' <- runX $ expected_xml'' >>> css "body"
461 let expected_xml = expected_xml' !! 0
462
463 assertEqual
464 "Image srcs are made absolute"
465 expected_xml
466 actual_xml
467 where
468 input_html =
469 "<body>" ++
470 "<img src=\"/images/2012/example.jpg\" />" ++
471 "</body>"
472 input_xml = parseHtml input_html
473 expected_html =
474 "<body>" ++
475 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
476 "</body>"
477 expected_xml'' = parseHtml expected_html
478
479
480 test_comments_removed :: Assertion
481 test_comments_removed = do
482 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
483 let actual_xml = actual_xml' !! 0
484
485 expected_xml' <- runX $ expected_xml'' >>> css "body"
486 let expected_xml = expected_xml' !! 0
487
488 assertEqual
489 "Comment links are removed"
490 expected_xml
491 actual_xml
492 where
493 input_html =
494 "<body><p>" ++
495 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
496 "</p></body>"
497 input_xml = parseHtml input_html
498
499 expected_html = "<body><p></p></body>"
500 expected_xml'' = parseHtml expected_html
501
502
503 test_full_story_urls_parsed :: Assertion
504 test_full_story_urls_parsed = do
505 actual <- runX $ actual'
506
507 assertEqual
508 "Full Story URLs are parsed"
509 expected
510 actual
511 where
512 expected = ["/Articles/500738/", "/Articles/501837/"]
513
514 full_story_html =
515 concat ["<p>",
516 "<a href=\"/Articles/500738/\">Full Story</a> ",
517 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
518 "<p>",
519 "<a href=\"/Articles/501837/\">Full Story</a> ",
520 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
521 "<p>"]
522
523 xml = parseHtml full_story_html
524 actual' = xml >>> full_story_urls
525
526 page_tests :: Test
527 page_tests =
528 testGroup "Page Tests" [
529 testCase "Links are replaced with spans" test_preprocess_links,
530 testCase "Image srcs are made absolute" test_absolve_images,
531 testCase "Comment links are removed" test_comments_removed,
532 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]