]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
2bbe21ae8e918ed49d658cac3fe05b0aa4905843
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle, hClose, hFlush)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import Data.Maybe (catMaybes, fromJust, isNothing)
12 import Test.HUnit (Assertion, assertEqual)
13 import Test.Framework (Test, testGroup)
14 import Test.Framework.Providers.HUnit (testCase)
15 import Text.Pandoc
16 import Text.XML.HXT.Core (
17 ArrowXml,
18 IOSArrow,
19 XmlTree,
20 (>>>),
21 (/>),
22 (//>),
23 changeAttrValue,
24 getAttrValue,
25 getChildren,
26 getText,
27 hasAttrValue,
28 hasName,
29 isElem,
30 mkName,
31 none,
32 processAttrl,
33 processTopDown,
34 runX,
35 setElemName,
36 xshow,
37 when
38 )
39 import Text.HandsomeSoup (css, parseHtml)
40
41 import LWN.Article
42 import LWN.HTTP (save_image)
43 import LWN.URI (URL, try_make_absolute_url)
44 import Misc (contains)
45 import XHTML
46
47 -- Map absolute image URLs to local system file paths where the image
48 -- referenced by the URL is stored.
49 type ImageMap = Map.Map URL FilePath
50
51 download_image_urls :: [URL] -> IO ImageMap
52 download_image_urls image_urls = do
53 files <- mapM save_image image_urls
54 let pairs = zip image_urls files
55 return $ foldl my_insert empty_map pairs
56 where
57 empty_map = Map.empty :: ImageMap
58
59 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
60 my_insert dict (_, Nothing) = dict
61 my_insert dict (k, Just v) = Map.insert k v dict
62
63 -- Should be called *after* preprocessing.
64 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
65 download_images xml = do
66 image_urls <- runX $ xml >>> image_srcs
67 download_image_urls image_urls
68
69
70 data Page =
71 -- | An LWN page with one article on it.
72 ArticlePage { article :: Article } |
73
74 -- | An LWN page with more than one article on it. These require
75 -- different parsing and display functions than the single-article
76 -- pages.
77 FullPage { headline :: String,
78 articles :: [Article] }
79
80
81 instance XHTML Page where
82 to_xhtml (ArticlePage a) =
83 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
84 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
85 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
86 "<head>" ++
87 " <meta http-equiv=\"Content-Type\"" ++
88 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
89 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
90 "</head>" ++
91 "<body>" ++
92 "<div>" ++
93 (to_xhtml a) ++
94 "</div>" ++
95 "</body>" ++
96 "</html>"
97
98 to_xhtml (FullPage hl as) =
99 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
100 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
101 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
102 "<head>" ++
103 " <meta http-equiv=\"Content-Type\"" ++
104 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
105 " <title>" ++ hl ++ "</title>" ++
106 "</head>" ++
107 "<body>" ++
108 "<div>" ++
109 "<h1>" ++ hl ++ "</h1>" ++
110 (concatMap to_xhtml as) ++
111 "</div>" ++
112 "</body>" ++
113 "</html>"
114
115
116
117 is_link :: (ArrowXml a) => a XmlTree XmlTree
118 is_link =
119 isElem >>> hasName "a"
120
121
122 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
123 remove_comment_links =
124 processTopDown $ kill_comments `when` is_link
125 where
126 is_comment_link =
127 hasAttrValue "href" (contains "#Comments")
128
129 kill_comments =
130 none `when` is_comment_link
131
132 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
133 replace_links_with_spans =
134 processTopDown $ (make_span >>> remove_attrs) `when` is_link
135 where
136 make_span = setElemName $ mkName "span"
137 remove_attrs = processAttrl none
138
139
140 -- | Preprocessing common to both page types.
141 preprocess :: (ArrowXml a) => a XmlTree XmlTree
142 preprocess =
143 make_image_srcs_absolute
144 >>>
145 remove_comment_links
146 >>>
147 replace_links_with_spans
148
149
150 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
151 replace_remote_img_srcs image_map =
152 processTopDown (make_srcs_local `when` is_image)
153 where
154 -- old_src -> new_src
155 change_src_func :: String -> String
156 change_src_func old_src =
157 case Map.lookup old_src image_map of
158 -- Leave it alone if we don't have the file locally
159 Nothing -> old_src
160 Just v -> v
161
162 change_src :: (ArrowXml a) => a XmlTree XmlTree
163 change_src =
164 changeAttrValue change_src_func
165
166 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
167 make_srcs_local =
168 processAttrl $ (change_src `when` (hasName "src"))
169
170
171 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
172 parse xml = do
173 let clean_xml = xml >>> preprocess
174 image_map <- download_images clean_xml
175 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
176 appr <- ap_parse local_xml
177 fppr <- fp_parse local_xml
178 return $
179 if (isNothing appr) then
180 fppr
181 else
182 appr
183
184
185
186 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
187 parse_headline xml = do
188 let element_filter = xml >>> css "div.PageHeadline h1"
189 let element_text_filter = element_filter /> getText
190 element_text <- runX element_text_filter
191 return $
192 case element_text of
193 [x] -> Just $ strip x
194 [] -> Nothing
195 _ -> error "Found more than one headline."
196
197
198 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
199 parse_byline xml = do
200 let element_filter = xml >>> css "div.FeatureByLine"
201 let element_text_filter = element_filter /> getText
202 element_text <- runX element_text_filter
203 return $
204 case element_text of
205 [x] -> Just $ strip x
206 [] -> Nothing
207 _ -> error "Found more than one article byline."
208
209
210 --
211 -- ArticlePage Stuff
212 --
213 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
214 ap_parse xml = do
215 arts <- ap_parse_articles xml
216 case arts of
217 [x] -> return $ Just $ ArticlePage x
218 _ -> return Nothing
219
220
221 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
222 ap_parse_body xml = do
223 let element_filter = xml >>> css "div.ArticleText"
224 let element_html_filter = xshow element_filter
225 element_html <- runX element_html_filter
226 return $ case element_html of
227 [x] -> Just x
228 [] -> Nothing
229 _ -> error "Found more than one article."
230
231
232 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
233 ap_parse_articles xml = do
234 parsed_headline <- parse_headline xml
235 parsed_byline <- parse_byline xml
236 parsed_body <- ap_parse_body xml
237
238 putStrLn $ fromJust parsed_headline
239
240 if (isNothing parsed_headline) || (isNothing parsed_body)
241 then return []
242 else do
243 let title' = Title $ fromJust parsed_headline
244 let byline' = Byline parsed_byline
245 let body' = BodyHtml $ fromJust parsed_body
246
247 return $ [Article title' byline' body']
248
249
250
251 --
252 -- FullPage Stuff
253 --
254
255 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
256 fp_parse xml = do
257 hl <- parse_headline xml
258 parsed_articles <- fp_parse_articles xml
259 case parsed_articles of
260 [] -> return Nothing
261 x -> return $ Just $ FullPage (fromJust hl) x
262
263
264
265 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
266 fp_parse_article_title xml = do
267 let element_filter = xml >>> css "h2.SummaryHL"
268 let element_text_filter = element_filter //> getText
269 element_text <- runX element_text_filter
270 return $ case element_text of
271 [x] -> Just $ strip x
272 [] -> Nothing
273 _ -> error "Found more than one article title."
274
275
276
277 is_title :: (ArrowXml a) => a XmlTree XmlTree
278 is_title =
279 (hasName "h2")
280 >>>
281 (hasAttrValue "class" (== "SummaryHL"))
282
283
284 is_byline :: (ArrowXml a) => a XmlTree XmlTree
285 is_byline =
286 (hasName "div")
287 >>>
288 (hasAttrValue "class" (== "FeatureByLine"))
289
290
291 is_image :: (ArrowXml a) => a XmlTree XmlTree
292 is_image = isElem >>> hasName "img"
293
294 remove_title :: (ArrowXml a) => a XmlTree XmlTree
295 remove_title =
296 processTopDown ((none) `when` is_title)
297
298
299 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
300 remove_byline =
301 processTopDown ((none) `when` is_byline)
302
303
304
305 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
306 fp_parse_article_body xml = do
307 -- First, delete the article title and byline.
308 let clean_xml' = xml >>> remove_title >>> remove_byline
309 -- The only child of the body element should be a div.lwn-article
310 -- since we wrapped the article's HTML in that.
311 let clean_xml = clean_xml' >>> css "body" >>> getChildren
312 clean_html <- runX . xshow $ clean_xml
313 return $ case clean_html of
314 [x] -> Just x
315 [] -> Nothing
316 _ -> error "Found more than one article body."
317
318 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
319 fp_parse_article xml = do
320 parsed_article_title <- fp_parse_article_title xml
321 parsed_article_byline <- parse_byline xml
322 parsed_article_body <- fp_parse_article_body xml
323
324 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
325 then
326 return Nothing
327 else do
328 let title' = Title $ fromJust parsed_article_title
329 let byline' = Byline parsed_article_byline
330 let body' = BodyHtml $ fromJust parsed_article_body
331 return $ Just $ Article title' byline' body'
332
333 parse_html_article :: String -> IO (Maybe Article)
334 parse_html_article html = do
335 let xml = parseHtml $ wrap_in_body_div html
336 fp_parse_article xml
337
338
339 -- | In the full page, all of the article titles and bodies are
340 -- wrapped in one big div.ArticleText.
341 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
342 parse_bodies xml =
343 xml >>> css "div.ArticleText"
344
345
346 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
347 fp_parse_articles xml = do
348 bodies <- runX . xshow $ parse_bodies xml
349 let article_separator = "<h2 class=\"SummaryHL\">"
350 let split_articles'' = split article_separator (concat bodies)
351 -- The first element will contain the crap before the first <h2...>.
352 let split_articles' = case split_articles'' of
353 (_:_) -> tail split_articles''
354 [] -> []
355 -- Put the separator back, it was lost during the split.
356 let split_articles = map (article_separator ++) split_articles'
357 --_ <- mapM print_article split_articles
358 real_articles <- mapM parse_html_article split_articles
359 let just_articles = catMaybes real_articles
360 return just_articles
361
362
363 -- | This makes it easy to select otherwise-random chunks of html
364 -- using 'css'.
365 wrap_in_body_div :: String -> String
366 wrap_in_body_div s =
367 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
368
369
370
371
372 --
373 -- Epublishable stuff
374 --
375
376 title :: Page -> String
377 title (ArticlePage a) = getTitle $ LWN.Article.title a
378 title (FullPage hl _) = hl
379
380
381 metadata :: Page -> IO String
382 metadata obj = do
383 date <- getCurrentTime
384 return $
385 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
386 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
387 "<dc:language>en-US</dc:language>\n" ++
388 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
389 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
390
391
392 epublish :: Page -> Handle -> IO ()
393 epublish obj handle = do
394 let xhtml = to_xhtml obj
395 epmd <- metadata obj
396 epub <- xhtml_to_epub epmd xhtml
397 B.hPut handle epub
398 hFlush handle
399 hClose handle
400
401 xhtml_to_epub :: String -> String -> IO B.ByteString
402 xhtml_to_epub epmd =
403 write_epub . read_html
404 where
405 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
406 write_epub = writeEPUB Nothing [] my_writer_options
407 read_html = readHtml defaultParserState
408
409
410
411 --
412 -- Misc
413 --
414
415 image_srcs :: (ArrowXml a) => a XmlTree URL
416 image_srcs =
417 css "img"
418 >>>
419 getAttrValue "src"
420
421 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
422 make_image_srcs_absolute =
423 processTopDown (make_srcs_absolute `when` is_image)
424 where
425 change_src :: (ArrowXml a) => a XmlTree XmlTree
426 change_src =
427 changeAttrValue try_make_absolute_url
428
429 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
430 make_srcs_absolute =
431 processAttrl $ change_src `when` hasName "src"
432
433
434 --
435 -- Tests
436 --
437
438 test_preprocess_links :: Assertion
439 test_preprocess_links = do
440 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
441 let actual_xml = actual_xml' !! 0
442
443 expected_xml' <- runX $ expected_xml'' >>> css "body"
444 let expected_xml = expected_xml' !! 0
445
446 assertEqual
447 "Links are replaced with spans"
448 expected_xml
449 actual_xml
450 where
451 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
452 input_xml = parseHtml input_html
453 expected_html = "<body><span>Hello, world!</span></body>"
454 expected_xml'' = parseHtml expected_html
455
456
457 test_absolve_images :: Assertion
458 test_absolve_images = do
459 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
460 let actual_xml = actual_xml' !! 0
461
462 expected_xml' <- runX $ expected_xml'' >>> css "body"
463 let expected_xml = expected_xml' !! 0
464
465 assertEqual
466 "Image srcs are made absolute"
467 expected_xml
468 actual_xml
469 where
470 input_html =
471 "<body>" ++
472 "<img src=\"/images/2012/example.jpg\" />" ++
473 "</body>"
474 input_xml = parseHtml input_html
475 expected_html =
476 "<body>" ++
477 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
478 "</body>"
479 expected_xml'' = parseHtml expected_html
480
481
482 test_comments_removed :: Assertion
483 test_comments_removed = do
484 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
485 let actual_xml = actual_xml' !! 0
486
487 expected_xml' <- runX $ expected_xml'' >>> css "body"
488 let expected_xml = expected_xml' !! 0
489
490 assertEqual
491 "Comment links are removed"
492 expected_xml
493 actual_xml
494 where
495 input_html =
496 "<body><p>" ++
497 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
498 "</p></body>"
499 input_xml = parseHtml input_html
500
501 expected_html = "<body><p></p></body>"
502 expected_xml'' = parseHtml expected_html
503
504
505
506 page_tests :: Test
507 page_tests =
508 testGroup "Page Tests" [
509 testCase "Links are replaced with spans" test_preprocess_links,
510 testCase "Image srcs are made absolute" test_absolve_images,
511 testCase "Comment links are removed" test_comments_removed ]