]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
0307214176a82c469f1f555a056b553c05bd3b06
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.List (isInfixOf)
11 import Data.String.Utils (split, strip)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Data.Tree.NTree.TypeDefs (NTree)
14 import Test.HUnit (Assertion, assertEqual)
15 import Test.Framework (Test, testGroup)
16 import Test.Framework.Providers.HUnit (testCase)
17 import Text.Pandoc
18 import Text.XML.HXT.Core (
19 ArrowXml,
20 IOSArrow,
21 XmlTree,
22 XNode,
23 (>>>),
24 (/>),
25 (//>),
26 changeAttrValue,
27 getAttrValue,
28 getChildren,
29 getText,
30 hasAttrValue,
31 hasName,
32 isElem,
33 mkName,
34 none,
35 processAttrl,
36 processTopDown,
37 runX,
38 setElemName,
39 xshow,
40 when
41 )
42 import Text.HandsomeSoup (css, parseHtml)
43
44 import LWN.Article
45 import LWN.HTTP (save_image)
46 import LWN.URI (URL, try_make_absolute_url)
47 import XHTML
48
49 -- Map absolute image URLs to local system file paths where the image
50 -- referenced by the URL is stored.
51 type ImageMap = Map.Map URL FilePath
52
53 -- Should be called *after* preprocessing.
54 download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
55 download_images xml = do
56 image_urls <- runX $ xml >>> image_srcs
57 files <- mapM save_image image_urls
58 let pairs = zip image_urls files
59 return $ foldl my_insert empty_map pairs
60 where
61 empty_map = Map.empty :: ImageMap
62
63 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
64 my_insert dict (_, Nothing) = dict
65 my_insert dict (k, Just v) = Map.insert k v dict
66
67
68 data Page =
69 -- | An LWN page with one article on it.
70 ArticlePage { article :: Article } |
71
72 -- | An LWN page with more than one article on it. These require
73 -- different parsing and display functions than the single-article
74 -- pages.
75 FullPage { headline :: String,
76 articles :: [Article] }
77
78
79 instance XHTML Page where
80 to_xhtml (ArticlePage a) =
81 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
82 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
83 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
84 "<head>" ++
85 " <meta http-equiv=\"Content-Type\"" ++
86 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
87 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
88 "</head>" ++
89 "<body>" ++
90 "<div>" ++
91 (to_xhtml a) ++
92 "</div>" ++
93 "</body>" ++
94 "</html>"
95
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
100 "<head>" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
104 "</head>" ++
105 "<body>" ++
106 "<div>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
109 "</div>" ++
110 "</body>" ++
111 "</html>"
112
113
114
115 is_link :: (ArrowXml a) => a XmlTree XmlTree
116 is_link =
117 isElem >>> hasName "a"
118
119
120 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
121 remove_comment_links =
122 processTopDown $ kill_comments `when` is_link
123 where
124 contains = isInfixOf
125
126 is_comment_link =
127 hasAttrValue "href" (contains "#Comments")
128
129 kill_comments =
130 none `when` is_comment_link
131
132 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
133 replace_links_with_spans =
134 processTopDown $ (make_span >>> remove_attrs) `when` is_link
135 where
136 make_span = setElemName $ mkName "span"
137 remove_attrs = processAttrl none
138
139
140 -- | Preprocessing common to both page types.
141 preprocess :: (ArrowXml a) => a XmlTree XmlTree
142 preprocess =
143 make_image_srcs_absolute
144 >>>
145 remove_comment_links
146 >>>
147 replace_links_with_spans
148
149
150 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
151 replace_remote_img_srcs image_map =
152 processTopDown (make_srcs_local `when` is_image)
153 where
154 -- old_src -> new_src
155 change_src_func :: String -> String
156 change_src_func old_src =
157 case Map.lookup old_src image_map of
158 -- Leave it alone if we don't have the file locally
159 Nothing -> old_src
160 Just v -> v
161
162 change_src :: (ArrowXml a) => a XmlTree XmlTree
163 change_src =
164 changeAttrValue change_src_func
165
166 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
167 make_srcs_local =
168 processAttrl $ (change_src `when` (hasName "src"))
169
170
171 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
172 parse xml = do
173 let clean_xml = xml >>> preprocess
174 image_map <- download_images clean_xml
175 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
176 appr <- ap_parse local_xml
177 fppr <- fp_parse local_xml
178 return $
179 if (isNothing appr) then
180 fppr
181 else
182 appr
183
184
185
186 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
187 parse_headline xml = do
188 let element_filter = xml >>> css "div.PageHeadline h1"
189 let element_text_filter = element_filter /> getText
190 element_text <- runX element_text_filter
191 return $
192 case element_text of
193 [x] -> Just $ strip x
194 [] -> Nothing
195 _ -> error "Found more than one headline."
196
197
198 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
199 parse_byline xml = do
200 let element_filter = xml >>> css "div.FeatureByLine"
201 let element_text_filter = element_filter /> getText
202 element_text <- runX element_text_filter
203 return $
204 case element_text of
205 [x] -> Just $ strip x
206 [] -> Nothing
207 _ -> error "Found more than one article byline."
208
209
210 --
211 -- ArticlePage Stuff
212 --
213 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
214 ap_parse xml = do
215 arts <- ap_parse_articles xml
216 case arts of
217 [x] -> return $ Just $ ArticlePage x
218 _ -> return Nothing
219
220
221 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
222 ap_parse_body xml = do
223 let element_filter = xml >>> css "div.ArticleText"
224 let element_html_filter = xshow element_filter
225 element_html <- runX element_html_filter
226 return $ case element_html of
227 [x] -> Just x
228 [] -> Nothing
229 _ -> error "Found more than one article."
230
231
232 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
233 ap_parse_articles xml = do
234 parsed_headline <- parse_headline xml
235 parsed_byline <- parse_byline xml
236 parsed_body <- ap_parse_body xml
237
238 putStrLn $ fromJust parsed_headline
239
240 if (isNothing parsed_headline) || (isNothing parsed_body)
241 then return []
242 else do
243 let title' = Title $ fromJust parsed_headline
244 let byline' = Byline parsed_byline
245 let body' = BodyHtml $ fromJust parsed_body
246
247 return $ [Article title' byline' body']
248
249
250
251 --
252 -- FullPage Stuff
253 --
254
255 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
256 fp_parse xml = do
257 hl <- parse_headline xml
258 parsed_articles <- fp_parse_articles xml
259 case parsed_articles of
260 [] -> return Nothing
261 x -> return $ Just $ FullPage (fromJust hl) x
262
263
264
265 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
266 fp_parse_article_title xml = do
267 let element_filter = xml >>> css "h2.SummaryHL"
268 let element_text_filter = element_filter //> getText
269 element_text <- runX element_text_filter
270 return $ case element_text of
271 [x] -> Just $ strip x
272 [] -> Nothing
273 _ -> error "Found more than one article title."
274
275
276
277 is_title :: (ArrowXml a) => a XmlTree XmlTree
278 is_title =
279 (hasName "h2")
280 >>>
281 (hasAttrValue "class" (== "SummaryHL"))
282
283
284 is_byline :: (ArrowXml a) => a XmlTree XmlTree
285 is_byline =
286 (hasName "div")
287 >>>
288 (hasAttrValue "class" (== "FeatureByLine"))
289
290
291 is_image :: (ArrowXml a) => a XmlTree XmlTree
292 is_image = isElem >>> hasName "img"
293
294 remove_title :: (ArrowXml a) => a XmlTree XmlTree
295 remove_title =
296 processTopDown ((none) `when` is_title)
297
298
299 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
300 remove_byline =
301 processTopDown ((none) `when` is_byline)
302
303
304
305 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
306 fp_parse_article_body xml = do
307 -- First, delete the article title and byline.
308 let clean_xml' = xml >>> remove_title >>> remove_byline
309 -- The only child of the body element should be a div.lwn-article
310 -- since we wrapped the article's HTML in that.
311 let clean_xml = clean_xml' >>> css "body" >>> getChildren
312 clean_html <- runX . xshow $ clean_xml
313 return $ case clean_html of
314 [x] -> Just x
315 [] -> Nothing
316 _ -> error "Found more than one article body."
317
318 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
319 fp_parse_article xml = do
320 parsed_article_title <- fp_parse_article_title xml
321 parsed_article_byline <- parse_byline xml
322 parsed_article_body <- fp_parse_article_body xml
323
324 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
325 then
326 return Nothing
327 else do
328 let title' = Title $ fromJust parsed_article_title
329 let byline' = Byline parsed_article_byline
330 let body' = BodyHtml $ fromJust parsed_article_body
331 return $ Just $ Article title' byline' body'
332
333 parse_html_article :: String -> IO (Maybe Article)
334 parse_html_article html = do
335 let xml = parseHtml $ wrap_in_body_div html
336 fp_parse_article xml
337
338
339 -- | In the full page, all of the article titles and bodies are
340 -- wrapped in one big div.ArticleText.
341 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
342 parse_bodies xml =
343 xml >>> css "div.ArticleText"
344
345
346 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
347 fp_parse_articles xml = do
348 bodies <- runX . xshow $ parse_bodies xml
349 let article_separator = "<h2 class=\"SummaryHL\">"
350 let split_articles'' = split article_separator (concat bodies)
351 -- The first element will contain the crap before the first <h2...>.
352 let split_articles' = case split_articles'' of
353 (_:_) -> tail split_articles''
354 [] -> []
355 -- Put the separator back, it was lost during the split.
356 let split_articles = map (article_separator ++) split_articles'
357 --_ <- mapM print_article split_articles
358 real_articles <- mapM parse_html_article split_articles
359 let just_articles = catMaybes real_articles
360 return just_articles
361
362
363 -- | This makes it easy to select otherwise-random chunks of html
364 -- using 'css'.
365 wrap_in_body_div :: String -> String
366 wrap_in_body_div s =
367 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
368
369
370
371
372 --
373 -- Epublishable stuff
374 --
375
376 title :: Page -> String
377 title (ArticlePage a) = getTitle $ LWN.Article.title a
378 title (FullPage hl _) = hl
379
380
381 metadata :: Page -> IO String
382 metadata obj = do
383 date <- getCurrentTime
384 return $
385 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
386 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
387 "<dc:language>en-US</dc:language>\n" ++
388 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
389 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
390
391
392 epublish :: Page -> Handle -> IO ()
393 epublish obj handle = do
394 let xhtml = to_xhtml obj
395 epmd <- metadata obj
396 epub <- xhtml_to_epub epmd xhtml
397 B.hPut handle epub
398
399
400 xhtml_to_epub :: String -> String -> IO B.ByteString
401 xhtml_to_epub epmd =
402 write_epub . read_html
403 where
404 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
405 write_epub = writeEPUB Nothing [] my_writer_options
406 read_html = readHtml defaultParserState
407
408
409
410 --
411 -- Misc
412 --
413
414 image_srcs :: (ArrowXml a) => a XmlTree URL
415 image_srcs =
416 css "img"
417 >>>
418 getAttrValue "src"
419
420 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
421 make_image_srcs_absolute =
422 processTopDown (make_srcs_absolute `when` is_image)
423 where
424 change_src :: (ArrowXml a) => a XmlTree XmlTree
425 change_src =
426 changeAttrValue try_make_absolute_url
427
428 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
429 make_srcs_absolute =
430 processAttrl $ change_src `when` hasName "src"
431
432
433 --
434 -- Tests
435 --
436
437 test_preprocess_links :: Assertion
438 test_preprocess_links = do
439 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
440 let actual_xml = actual_xml' !! 0
441
442 expected_xml' <- runX $ expected_xml'' >>> css "body"
443 let expected_xml = expected_xml' !! 0
444
445 assertEqual
446 "Links are replaced with spans"
447 expected_xml
448 actual_xml
449 where
450 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
451 input_xml = parseHtml input_html
452 expected_html = "<body><span>Hello, world!</span></body>"
453 expected_xml'' = parseHtml expected_html
454
455
456 test_absolve_images :: Assertion
457 test_absolve_images = do
458 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
459 let actual_xml = actual_xml' !! 0
460
461 expected_xml' <- runX $ expected_xml'' >>> css "body"
462 let expected_xml = expected_xml' !! 0
463
464 assertEqual
465 "Image srcs are made absolute"
466 expected_xml
467 actual_xml
468 where
469 input_html =
470 "<body>" ++
471 "<img src=\"/images/2012/example.jpg\" />" ++
472 "</body>"
473 input_xml = parseHtml input_html
474 expected_html =
475 "<body>" ++
476 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
477 "</body>"
478 expected_xml'' = parseHtml expected_html
479
480
481 test_comments_removed :: Assertion
482 test_comments_removed = do
483 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
484 let actual_xml = actual_xml' !! 0
485
486 expected_xml' <- runX $ expected_xml'' >>> css "body"
487 let expected_xml = expected_xml' !! 0
488
489 assertEqual
490 "Comment links are removed"
491 expected_xml
492 actual_xml
493 where
494 input_html =
495 "<body><p>" ++
496 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
497 "</p></body>"
498 input_xml = parseHtml input_html
499
500 expected_html = "<body><p></p></body>"
501 expected_xml'' = parseHtml expected_html
502
503
504
505 page_tests :: Test
506 page_tests =
507 testGroup "Page Tests" [
508 testCase "Links are replaced with spans" test_preprocess_links,
509 testCase "Image srcs are made absolute" test_absolve_images,
510 testCase "Comment links are removed" test_comments_removed ]