]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Final cleanups to get the download working.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle, hClose, hFlush)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import Data.Maybe (catMaybes, fromJust, isNothing)
12 import Data.Tree.NTree.TypeDefs (NTree)
13 import Test.HUnit (Assertion, assertEqual)
14 import Test.Framework (Test, testGroup)
15 import Test.Framework.Providers.HUnit (testCase)
16 import Text.Pandoc
17 import Text.XML.HXT.Core (
18 ArrowXml,
19 IOSArrow,
20 XmlTree,
21 XNode,
22 (>>>),
23 (/>),
24 (//>),
25 changeAttrValue,
26 getAttrValue,
27 getChildren,
28 getText,
29 hasAttrValue,
30 hasName,
31 isElem,
32 mkName,
33 none,
34 processAttrl,
35 processTopDown,
36 runX,
37 setElemName,
38 xshow,
39 when
40 )
41 import Text.HandsomeSoup (css, parseHtml)
42
43 import LWN.Article
44 import LWN.HTTP (save_image)
45 import LWN.URI (URL, try_make_absolute_url)
46 import Misc (contains)
47 import XHTML
48
49 -- Map absolute image URLs to local system file paths where the image
50 -- referenced by the URL is stored.
51 type ImageMap = Map.Map URL FilePath
52
53 -- Should be called *after* preprocessing.
54 download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
55 download_images xml = do
56 image_urls <- runX $ xml >>> image_srcs
57 files <- mapM save_image image_urls
58 let pairs = zip image_urls files
59 return $ foldl my_insert empty_map pairs
60 where
61 empty_map = Map.empty :: ImageMap
62
63 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
64 my_insert dict (_, Nothing) = dict
65 my_insert dict (k, Just v) = Map.insert k v dict
66
67
68 data Page =
69 -- | An LWN page with one article on it.
70 ArticlePage { article :: Article } |
71
72 -- | An LWN page with more than one article on it. These require
73 -- different parsing and display functions than the single-article
74 -- pages.
75 FullPage { headline :: String,
76 articles :: [Article] }
77
78
79 instance XHTML Page where
80 to_xhtml (ArticlePage a) =
81 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
82 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
83 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
84 "<head>" ++
85 " <meta http-equiv=\"Content-Type\"" ++
86 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
87 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
88 "</head>" ++
89 "<body>" ++
90 "<div>" ++
91 (to_xhtml a) ++
92 "</div>" ++
93 "</body>" ++
94 "</html>"
95
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
100 "<head>" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
104 "</head>" ++
105 "<body>" ++
106 "<div>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
109 "</div>" ++
110 "</body>" ++
111 "</html>"
112
113
114
115 is_link :: (ArrowXml a) => a XmlTree XmlTree
116 is_link =
117 isElem >>> hasName "a"
118
119
120 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
121 remove_comment_links =
122 processTopDown $ kill_comments `when` is_link
123 where
124 is_comment_link =
125 hasAttrValue "href" (contains "#Comments")
126
127 kill_comments =
128 none `when` is_comment_link
129
130 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
131 replace_links_with_spans =
132 processTopDown $ (make_span >>> remove_attrs) `when` is_link
133 where
134 make_span = setElemName $ mkName "span"
135 remove_attrs = processAttrl none
136
137
138 -- | Preprocessing common to both page types.
139 preprocess :: (ArrowXml a) => a XmlTree XmlTree
140 preprocess =
141 make_image_srcs_absolute
142 >>>
143 remove_comment_links
144 >>>
145 replace_links_with_spans
146
147
148 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
149 replace_remote_img_srcs image_map =
150 processTopDown (make_srcs_local `when` is_image)
151 where
152 -- old_src -> new_src
153 change_src_func :: String -> String
154 change_src_func old_src =
155 case Map.lookup old_src image_map of
156 -- Leave it alone if we don't have the file locally
157 Nothing -> old_src
158 Just v -> v
159
160 change_src :: (ArrowXml a) => a XmlTree XmlTree
161 change_src =
162 changeAttrValue change_src_func
163
164 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
165 make_srcs_local =
166 processAttrl $ (change_src `when` (hasName "src"))
167
168
169 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
170 parse xml = do
171 let clean_xml = xml >>> preprocess
172 image_map <- download_images clean_xml
173 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
174 appr <- ap_parse local_xml
175 fppr <- fp_parse local_xml
176 return $
177 if (isNothing appr) then
178 fppr
179 else
180 appr
181
182
183
184 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
185 parse_headline xml = do
186 let element_filter = xml >>> css "div.PageHeadline h1"
187 let element_text_filter = element_filter /> getText
188 element_text <- runX element_text_filter
189 return $
190 case element_text of
191 [x] -> Just $ strip x
192 [] -> Nothing
193 _ -> error "Found more than one headline."
194
195
196 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
197 parse_byline xml = do
198 let element_filter = xml >>> css "div.FeatureByLine"
199 let element_text_filter = element_filter /> getText
200 element_text <- runX element_text_filter
201 return $
202 case element_text of
203 [x] -> Just $ strip x
204 [] -> Nothing
205 _ -> error "Found more than one article byline."
206
207
208 --
209 -- ArticlePage Stuff
210 --
211 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
212 ap_parse xml = do
213 arts <- ap_parse_articles xml
214 case arts of
215 [x] -> return $ Just $ ArticlePage x
216 _ -> return Nothing
217
218
219 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
220 ap_parse_body xml = do
221 let element_filter = xml >>> css "div.ArticleText"
222 let element_html_filter = xshow element_filter
223 element_html <- runX element_html_filter
224 return $ case element_html of
225 [x] -> Just x
226 [] -> Nothing
227 _ -> error "Found more than one article."
228
229
230 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
231 ap_parse_articles xml = do
232 parsed_headline <- parse_headline xml
233 parsed_byline <- parse_byline xml
234 parsed_body <- ap_parse_body xml
235
236 putStrLn $ fromJust parsed_headline
237
238 if (isNothing parsed_headline) || (isNothing parsed_body)
239 then return []
240 else do
241 let title' = Title $ fromJust parsed_headline
242 let byline' = Byline parsed_byline
243 let body' = BodyHtml $ fromJust parsed_body
244
245 return $ [Article title' byline' body']
246
247
248
249 --
250 -- FullPage Stuff
251 --
252
253 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
254 fp_parse xml = do
255 hl <- parse_headline xml
256 parsed_articles <- fp_parse_articles xml
257 case parsed_articles of
258 [] -> return Nothing
259 x -> return $ Just $ FullPage (fromJust hl) x
260
261
262
263 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
264 fp_parse_article_title xml = do
265 let element_filter = xml >>> css "h2.SummaryHL"
266 let element_text_filter = element_filter //> getText
267 element_text <- runX element_text_filter
268 return $ case element_text of
269 [x] -> Just $ strip x
270 [] -> Nothing
271 _ -> error "Found more than one article title."
272
273
274
275 is_title :: (ArrowXml a) => a XmlTree XmlTree
276 is_title =
277 (hasName "h2")
278 >>>
279 (hasAttrValue "class" (== "SummaryHL"))
280
281
282 is_byline :: (ArrowXml a) => a XmlTree XmlTree
283 is_byline =
284 (hasName "div")
285 >>>
286 (hasAttrValue "class" (== "FeatureByLine"))
287
288
289 is_image :: (ArrowXml a) => a XmlTree XmlTree
290 is_image = isElem >>> hasName "img"
291
292 remove_title :: (ArrowXml a) => a XmlTree XmlTree
293 remove_title =
294 processTopDown ((none) `when` is_title)
295
296
297 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
298 remove_byline =
299 processTopDown ((none) `when` is_byline)
300
301
302
303 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
304 fp_parse_article_body xml = do
305 -- First, delete the article title and byline.
306 let clean_xml' = xml >>> remove_title >>> remove_byline
307 -- The only child of the body element should be a div.lwn-article
308 -- since we wrapped the article's HTML in that.
309 let clean_xml = clean_xml' >>> css "body" >>> getChildren
310 clean_html <- runX . xshow $ clean_xml
311 return $ case clean_html of
312 [x] -> Just x
313 [] -> Nothing
314 _ -> error "Found more than one article body."
315
316 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
317 fp_parse_article xml = do
318 parsed_article_title <- fp_parse_article_title xml
319 parsed_article_byline <- parse_byline xml
320 parsed_article_body <- fp_parse_article_body xml
321
322 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
323 then
324 return Nothing
325 else do
326 let title' = Title $ fromJust parsed_article_title
327 let byline' = Byline parsed_article_byline
328 let body' = BodyHtml $ fromJust parsed_article_body
329 return $ Just $ Article title' byline' body'
330
331 parse_html_article :: String -> IO (Maybe Article)
332 parse_html_article html = do
333 let xml = parseHtml $ wrap_in_body_div html
334 fp_parse_article xml
335
336
337 -- | In the full page, all of the article titles and bodies are
338 -- wrapped in one big div.ArticleText.
339 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
340 parse_bodies xml =
341 xml >>> css "div.ArticleText"
342
343
344 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
345 fp_parse_articles xml = do
346 bodies <- runX . xshow $ parse_bodies xml
347 let article_separator = "<h2 class=\"SummaryHL\">"
348 let split_articles'' = split article_separator (concat bodies)
349 -- The first element will contain the crap before the first <h2...>.
350 let split_articles' = case split_articles'' of
351 (_:_) -> tail split_articles''
352 [] -> []
353 -- Put the separator back, it was lost during the split.
354 let split_articles = map (article_separator ++) split_articles'
355 --_ <- mapM print_article split_articles
356 real_articles <- mapM parse_html_article split_articles
357 let just_articles = catMaybes real_articles
358 return just_articles
359
360
361 -- | This makes it easy to select otherwise-random chunks of html
362 -- using 'css'.
363 wrap_in_body_div :: String -> String
364 wrap_in_body_div s =
365 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
366
367
368
369
370 --
371 -- Epublishable stuff
372 --
373
374 title :: Page -> String
375 title (ArticlePage a) = getTitle $ LWN.Article.title a
376 title (FullPage hl _) = hl
377
378
379 metadata :: Page -> IO String
380 metadata obj = do
381 date <- getCurrentTime
382 return $
383 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
384 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
385 "<dc:language>en-US</dc:language>\n" ++
386 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
387 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
388
389
390 epublish :: Page -> Handle -> IO ()
391 epublish obj handle = do
392 let xhtml = to_xhtml obj
393 epmd <- metadata obj
394 epub <- xhtml_to_epub epmd xhtml
395 B.hPut handle epub
396 hFlush handle
397 hClose handle
398
399 xhtml_to_epub :: String -> String -> IO B.ByteString
400 xhtml_to_epub epmd =
401 write_epub . read_html
402 where
403 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
404 write_epub = writeEPUB Nothing [] my_writer_options
405 read_html = readHtml defaultParserState
406
407
408
409 --
410 -- Misc
411 --
412
413 image_srcs :: (ArrowXml a) => a XmlTree URL
414 image_srcs =
415 css "img"
416 >>>
417 getAttrValue "src"
418
419 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
420 make_image_srcs_absolute =
421 processTopDown (make_srcs_absolute `when` is_image)
422 where
423 change_src :: (ArrowXml a) => a XmlTree XmlTree
424 change_src =
425 changeAttrValue try_make_absolute_url
426
427 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
428 make_srcs_absolute =
429 processAttrl $ change_src `when` hasName "src"
430
431
432 --
433 -- Tests
434 --
435
436 test_preprocess_links :: Assertion
437 test_preprocess_links = do
438 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
439 let actual_xml = actual_xml' !! 0
440
441 expected_xml' <- runX $ expected_xml'' >>> css "body"
442 let expected_xml = expected_xml' !! 0
443
444 assertEqual
445 "Links are replaced with spans"
446 expected_xml
447 actual_xml
448 where
449 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
450 input_xml = parseHtml input_html
451 expected_html = "<body><span>Hello, world!</span></body>"
452 expected_xml'' = parseHtml expected_html
453
454
455 test_absolve_images :: Assertion
456 test_absolve_images = do
457 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
458 let actual_xml = actual_xml' !! 0
459
460 expected_xml' <- runX $ expected_xml'' >>> css "body"
461 let expected_xml = expected_xml' !! 0
462
463 assertEqual
464 "Image srcs are made absolute"
465 expected_xml
466 actual_xml
467 where
468 input_html =
469 "<body>" ++
470 "<img src=\"/images/2012/example.jpg\" />" ++
471 "</body>"
472 input_xml = parseHtml input_html
473 expected_html =
474 "<body>" ++
475 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
476 "</body>"
477 expected_xml'' = parseHtml expected_html
478
479
480 test_comments_removed :: Assertion
481 test_comments_removed = do
482 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
483 let actual_xml = actual_xml' !! 0
484
485 expected_xml' <- runX $ expected_xml'' >>> css "body"
486 let expected_xml = expected_xml' !! 0
487
488 assertEqual
489 "Comment links are removed"
490 expected_xml
491 actual_xml
492 where
493 input_html =
494 "<body><p>" ++
495 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
496 "</p></body>"
497 input_xml = parseHtml input_html
498
499 expected_html = "<body><p></p></body>"
500 expected_xml'' = parseHtml expected_html
501
502
503
504 page_tests :: Test
505 page_tests =
506 testGroup "Page Tests" [
507 testCase "Links are replaced with spans" test_preprocess_links,
508 testCase "Image srcs are made absolute" test_absolve_images,
509 testCase "Comment links are removed" test_comments_removed ]