49faa0bb4d6d1b0c77c0b4c6eb8d00e1a9261c50
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata)
24 import Text.XML.HXT.Core (
25 ArrowXml,
26 IOSArrow,
27 XmlTree,
28 ($<),
29 (>>>),
30 (/>),
31 (//>),
32 changeAttrValue,
33 getAttrValue,
34 getChildren,
35 getText,
36 hasName,
37 processAttrl,
38 processTopDown,
39 this,
40 runX,
41 xshow,
42 when)
43 import Text.HandsomeSoup (css, parseHtml)
44
45 import Configuration (Cfg)
46 import LWN.Article
47 import LWN.HTTP (
48 ImageMap,
49 download_image_urls,
50 get_article_contents)
51 import LWN.URI (URL)
52 import LWN.XHTML (
53 XHTML,
54 full_story_urls,
55 image_srcs,
56 full_story_link,
57 full_story_paragraph,
58 is_image,
59 preprocess,
60 remove_byline,
61 remove_title,
62 to_xhtml,
63 to_xml,
64 xml_from_contents)
65
66
67
68
69 data Page =
70 -- | An LWN page with one article on it.
71 ArticlePage { article :: Article } |
72
73 -- | An LWN page with more than one article on it. These require
74 -- different parsing and display functions than the single-article
75 -- pages.
76 FullPage { headline :: String,
77 articles :: [Article] }
78
79
80 instance XHTML Page where
81 to_xhtml (ArticlePage a) =
82 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
83 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
84 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
85 "<head>" ++
86 " <meta http-equiv=\"Content-Type\"" ++
87 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
88 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
89 "</head>" ++
90 "<body>" ++
91 (to_xhtml a) ++
92 "</body>" ++
93 "</html>"
94
95 to_xhtml (FullPage hl as) =
96 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
97 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
98 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
99 "<head>" ++
100 " <meta http-equiv=\"Content-Type\"" ++
101 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
102 " <title>" ++ hl ++ "</title>" ++
103 "</head>" ++
104 "<body>" ++
105 "<div>" ++
106 "<h1>" ++ hl ++ "</h1>" ++
107 (concatMap to_xhtml as) ++
108 "</div>" ++
109 "</body>" ++
110 "</html>"
111
112
113
114 page_from_url :: Cfg -> URL -> IO (Maybe Page)
115 page_from_url cfg url = do
116 contents <- get_article_contents cfg url
117 case (xml_from_contents contents) of
118 Just html -> parse cfg html
119 Nothing -> return Nothing
120
121
122
123 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
124 insert_full_stories story_map =
125 processTopDown (article_xml `when` full_story_paragraph)
126 where
127 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
128 lookup_func href =
129 case Map.lookup href story_map of
130 -- Leave it alone if we don't have the full story.
131 Nothing -> this
132 Just v -> to_xml v
133
134 article_xml :: (ArrowXml a) => a XmlTree XmlTree
135 article_xml =
136 lookup_func
137 $<
138 (this /> full_story_link >>> getAttrValue "href")
139
140 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
141 replace_remote_img_srcs image_map =
142 processTopDown (make_srcs_local `when` is_image)
143 where
144 -- old_src -> new_src
145 change_src_func :: String -> String
146 change_src_func old_src =
147 case Map.lookup old_src image_map of
148 -- Leave it alone if we don't have the file locally
149 Nothing -> old_src
150 Just v -> v
151
152 change_src :: (ArrowXml a) => a XmlTree XmlTree
153 change_src =
154 changeAttrValue change_src_func
155
156 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
157 make_srcs_local =
158 processAttrl $ (change_src `when` (hasName "src"))
159
160
161
162
163 -- Should be called *after* preprocessing.
164 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
165 download_images xml = do
166 image_urls <- runX $ xml >>> image_srcs
167 download_image_urls image_urls
168
169
170
171 type StoryMap = Map.Map URL Article
172
173 -- These come *before* preprocessing.
174 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
175 download_full_story_urls cfg story_urls = do
176 pages <- parallel $ map (page_from_url cfg) story_urls
177 let pairs = zip story_urls pages
178 return $ foldl my_insert empty_map pairs
179 where
180 empty_map = Map.empty :: StoryMap
181
182 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
183 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
184 my_insert dict (_, _) = dict
185
186
187 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
188 download_full_stories cfg xml = do
189 story_urls <- runX $ xml >>> full_story_urls
190 download_full_story_urls cfg story_urls
191
192
193 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
194 parse cfg xml = do
195 story_map <- download_full_stories cfg xml
196 let fs_xml = xml >>> insert_full_stories story_map
197
198 let clean_xml = fs_xml >>> preprocess
199 image_map <- download_images clean_xml
200 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
201
202 appr <- ap_parse local_xml
203 fppr <- fp_parse local_xml
204 return $
205 if (isNothing appr) then
206 fppr
207 else
208 appr
209
210
211
212 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
213 parse_headline xml = do
214 let element_filter = xml >>> css "div.PageHeadline h1"
215 let element_text_filter = element_filter /> getText
216 element_text <- runX element_text_filter
217 return $
218 case element_text of
219 [x] -> Just $ strip x
220 [] -> Nothing
221 _ -> error "Found more than one headline."
222
223
224 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
225 parse_byline xml = do
226 let element_filter = xml >>> css "div.FeatureByLine"
227 let element_text_filter = element_filter /> getText
228 element_text <- runX element_text_filter
229 return $
230 case element_text of
231 [x] -> Just $ strip x
232 [] -> Nothing
233 _ -> error "Found more than one article byline."
234
235
236 --
237 -- ArticlePage Stuff
238 --
239 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
240 ap_parse xml = do
241 arts <- ap_parse_articles xml
242 case arts of
243 [x] -> return $ Just $ ArticlePage x
244 _ -> return Nothing
245
246
247 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
248 ap_parse_body xml = do
249 let element_filter = xml >>> css "div.ArticleText"
250 let element_html_filter = xshow element_filter
251 element_html <- runX element_html_filter
252 return $ case element_html of
253 [x] -> Just x
254 [] -> Nothing
255 _ -> error "Found more than one article."
256
257
258 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
259 ap_parse_articles xml = do
260 parsed_headline <- parse_headline xml
261 parsed_byline <- parse_byline xml
262 parsed_body <- ap_parse_body xml
263
264 if (isNothing parsed_headline) || (isNothing parsed_body)
265 then return []
266 else do
267 let title' = Title $ fromJust parsed_headline
268 let byline' = Byline parsed_byline
269 let body' = BodyHtml $ fromJust parsed_body
270
271 return $ [Article title' byline' body']
272
273
274
275 --
276 -- FullPage Stuff
277 --
278
279 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
280 fp_parse xml = do
281 hl <- parse_headline xml
282 parsed_articles <- fp_parse_articles xml
283 case parsed_articles of
284 [] -> return Nothing
285 x -> return $ Just $ FullPage (fromJust hl) x
286
287
288
289 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
290 fp_parse_article_title xml = do
291 let element_filter = xml >>> css "h2.SummaryHL"
292 let element_text_filter = element_filter //> getText
293 element_text <- runX element_text_filter
294 return $ case element_text of
295 [x] -> Just $ strip x
296 [] -> Nothing
297 _ -> error "Found more than one article title."
298
299
300
301
302 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
303 fp_parse_article_body xml = do
304 -- First, delete the article title and byline.
305 let clean_xml' = xml >>> remove_title >>> remove_byline
306 -- The only child of the body element should be a div.lwn-article
307 -- since we wrapped the article's HTML in that.
308 let clean_xml = clean_xml' >>> css "body" >>> getChildren
309 clean_html <- runX . xshow $ clean_xml
310 return $ case clean_html of
311 [x] -> Just x
312 [] -> Nothing
313 _ -> error "Found more than one article body."
314
315 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
316 fp_parse_article xml = do
317 parsed_article_title <- fp_parse_article_title xml
318 parsed_article_byline <- parse_byline xml
319 parsed_article_body <- fp_parse_article_body xml
320
321 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
322 then
323 return Nothing
324 else do
325 let title' = Title $ fromJust parsed_article_title
326 let byline' = Byline parsed_article_byline
327 let body' = BodyHtml $ fromJust parsed_article_body
328 return $ Just $ Article title' byline' body'
329
330 parse_html_article :: String -> IO (Maybe Article)
331 parse_html_article html = do
332 let xml = parseHtml $ wrap_in_body_div html
333 fp_parse_article xml
334
335
336 -- | In the full page, all of the article titles and bodies are
337 -- wrapped in one big div.ArticleText.
338 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
339 parse_bodies xml =
340 xml >>> css "div.ArticleText"
341
342
343 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
344 fp_parse_articles xml = do
345 bodies <- runX . xshow $ parse_bodies xml
346 let article_separator = "<h2 class=\"SummaryHL\">"
347 let split_articles'' = split article_separator (concat bodies)
348 -- The first element will contain the crap before the first <h2...>.
349 let split_articles' = case split_articles'' of
350 (_:_) -> tail split_articles''
351 [] -> []
352 -- Put the separator back, it was lost during the split.
353 let split_articles = map (article_separator ++) split_articles'
354 real_articles <- mapM parse_html_article split_articles
355 let just_articles = catMaybes real_articles
356 return just_articles
357
358
359 -- | This makes it easy to select otherwise-random chunks of html
360 -- using 'css'.
361 wrap_in_body_div :: String -> String
362 wrap_in_body_div s =
363 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
364
365
366
367
368 --
369 -- Epublishable stuff
370 --
371
372 title :: Page -> String
373 title (ArticlePage a) = getTitle $ LWN.Article.title a
374 title (FullPage hl _) = hl
375
376
377 metadata :: Page -> IO String
378 metadata obj = do
379 date <- getCurrentTime
380 return $
381 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
382 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
383 "<dc:language>en-US</dc:language>\n" ++
384 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
385 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
386
387
388 epublish :: Page -> Handle -> IO ()
389 epublish obj handle = do
390 let xhtml = to_xhtml obj
391 epmd <- metadata obj
392 epub <- xhtml_to_epub epmd xhtml
393 B.hPut handle epub
394 hFlush handle
395 hClose handle
396
397 xhtml_to_epub :: String -> String -> IO B.ByteString
398 xhtml_to_epub epmd =
399 write_epub . read_html
400 where
401 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
402 write_epub = writeEPUB Nothing [] my_writer_options
403 read_html = readHtml defaultParserState
404
405
406 --
407 -- Tests
408 --
409
410 test_preprocess_links :: Assertion
411 test_preprocess_links = do
412 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
413 let actual_xml = actual_xml' !! 0
414
415 expected_xml' <- runX $ expected_xml'' >>> css "body"
416 let expected_xml = expected_xml' !! 0
417
418 assertEqual
419 "Links are replaced with spans"
420 expected_xml
421 actual_xml
422 where
423 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
424 input_xml = parseHtml input_html
425 expected_html = "<body><span>Hello, world!</span></body>"
426 expected_xml'' = parseHtml expected_html
427
428
429 test_absolve_images :: Assertion
430 test_absolve_images = do
431 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
432 let actual_xml = actual_xml' !! 0
433
434 expected_xml' <- runX $ expected_xml'' >>> css "body"
435 let expected_xml = expected_xml' !! 0
436
437 assertEqual
438 "Image srcs are made absolute"
439 expected_xml
440 actual_xml
441 where
442 input_html =
443 "<body>" ++
444 "<img src=\"/images/2012/example.jpg\" />" ++
445 "</body>"
446 input_xml = parseHtml input_html
447 expected_html =
448 "<body>" ++
449 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
450 "</body>"
451 expected_xml'' = parseHtml expected_html
452
453
454 test_comments_removed :: Assertion
455 test_comments_removed = do
456 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
457 let actual_xml = actual_xml' !! 0
458
459 expected_xml' <- runX $ expected_xml'' >>> css "body"
460 let expected_xml = expected_xml' !! 0
461
462 assertEqual
463 "Comment links are removed"
464 expected_xml
465 actual_xml
466 where
467 input_html =
468 "<body><p>" ++
469 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
470 "</p></body>"
471 input_xml = parseHtml input_html
472
473 expected_html = "<body><p></p></body>"
474 expected_xml'' = parseHtml expected_html
475
476
477 test_full_story_urls_parsed :: Assertion
478 test_full_story_urls_parsed = do
479 actual <- runX $ actual'
480
481 assertEqual
482 "Full Story URLs are parsed"
483 expected
484 actual
485 where
486 expected = ["/Articles/500738/", "/Articles/501837/"]
487
488 full_story_html =
489 concat ["<p>",
490 "<a href=\"/Articles/500738/\">Full Story</a> ",
491 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
492 "<p>",
493 "<a href=\"/Articles/501837/\">Full Story</a> ",
494 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
495 "<p>"]
496
497 xml = parseHtml full_story_html
498 actual' = xml >>> full_story_urls
499
500 page_tests :: Test
501 page_tests =
502 testGroup "Page Tests" [
503 testCase "Links are replaced with spans" test_preprocess_links,
504 testCase "Image srcs are made absolute" test_absolve_images,
505 testCase "Comment links are removed" test_comments_removed,
506 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]