{-# LANGUAGE DoAndIfThenElse #-}

module LWN.Page
where

import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.Directory (doesFileExist)
import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
import System.IO.UTF8 (readFile)
import Test.HUnit (Assertion, assertEqual)
import Test.Framework (Test, testGroup)
import Test.Framework.Providers.HUnit (testCase)
import Text.Pandoc (
  defaultParserState,
  defaultWriterOptions,
  readHtml,
  writeEPUB,
  writerEPUBMetadata)
import Text.XML.HXT.Core (
  ArrowXml,
  IOSArrow,
  IOStateArrow,
  XmlTree,
  (>>>),
  (/>),
  (//>),
  changeAttrValue,
  getAttrValue,
  getChildren,
  getText,
  hasAttrValue,
  hasName,
  isElem,
  mkName,
  none,
  processAttrl,
  processTopDown,
  runX,
  setElemName,
  xshow,
  when)
import Text.HandsomeSoup (css, parseHtml)

import Configuration (Cfg, password, use_account, username)
import LWN.Article
import LWN.HTTP (
  ImageMap,
  download_image_urls,
  get_page,
  log_in,
  make_cookie_jar)
import LWN.URI (URL, try_make_absolute_url)
import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
import Misc (contains)


-- | Try to parse the given article using HXT. We try a few different
--   methods; if none of them work, we return 'Nothing'.
get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
get_xml_from_article cfg article_name = do
  my_article <- real_article_path article_name
  is_file <- doesFileExist my_article
  case is_file of
    True -> do
      contents <- readFile my_article
      return $ Just $ parse_lwn contents
    False -> do
      -- Download the URL and try to parse it.
      if use_account cfg then do
        -- If use_account is True, the username and password are known
        -- to be set, so these fromJusts are safe.
        cj <- make_cookie_jar
        li_result <- log_in cj
                       (fromJust $ username cfg)
                       (fromJust $ password cfg)

        case li_result of
          Left err -> do
            let msg = "Failed to log in. " ++ err
            hPutStrLn stderr msg
          Right response_body -> do
            hPutStrLn stderr response_body

        html <- get_page (Just cj) my_article

        case html of
          Left err -> do
            let msg = "Failed to retrieve page. " ++ err
            hPutStrLn stderr msg
            return Nothing
          Right h -> return $ Just $ parse_lwn h
      else do
        html <- get_page Nothing my_article
        case html of
          Left err -> do
            let msg = "Failed to retrieve page. " ++ err
            hPutStrLn stderr msg
            return Nothing
          Right h -> return $ Just $ parse_lwn h


-- Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml = do
  image_urls <- runX $ xml >>> image_srcs
  download_image_urls image_urls


data Page =
  -- | An LWN page with one article on it.
  ArticlePage { article :: Article } |

  -- | An LWN page with more than one article on it. These require
  --   different parsing and display functions than the single-article
  --   pages.
  FullPage { headline :: String,
             articles :: [Article] }


instance XHTML Page where
  to_xhtml (ArticlePage a) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html>" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ (getTitle $ LWN.Article.title a) ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    (to_xhtml a) ++
    "</body>" ++
    "</html>"

  to_xhtml (FullPage hl as) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html>" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ hl ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    "<div>" ++
    "<h1>" ++ hl ++ "</h1>" ++
    (concatMap to_xhtml as) ++
    "</div>" ++
    "</body>" ++
    "</html>"
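
-- A quick GHCi-style sanity check of the serialization above. The
-- article value is made up purely for illustration; 'Title', 'Byline'
-- and 'BodyHtml' are the same constructors used by the parsers below.
--
--   ghci> let art = Article (Title "Example") (Byline Nothing) (BodyHtml "<p>Hi.</p>")
--   ghci> to_xhtml (ArticlePage art)
--   "<?xml version=\"1.0\" encoding=\"utf-8\" ?>...</html>"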


page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  maybe_html <- get_xml_from_article cfg url
  case maybe_html of
    Just html -> parse html
    Nothing -> return Nothing


is_link :: (ArrowXml a) => a XmlTree XmlTree
is_link =
  isElem >>> hasName "a"


remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
remove_comment_links =
  processTopDown $ kill_comments `when` is_link
  where
    is_comment_link =
      hasAttrValue "href" (contains "#Comments")

    kill_comments =
      none `when` is_comment_link


replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
replace_links_with_spans =
  processTopDown $ (make_span >>> remove_attrs) `when` is_link
  where
    make_span = setElemName $ mkName "span"
    remove_attrs = processAttrl none


-- | Preprocessing common to both page types.
preprocess :: (ArrowXml a) => a XmlTree XmlTree
preprocess =
  make_image_srcs_absolute
  >>>
  remove_comment_links
  >>>
  replace_links_with_spans


replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (make_srcs_local `when` is_image)
  where
    -- old_src -> new_src
    change_src_func :: String -> String
    change_src_func old_src =
      case Map.lookup old_src image_map of
        -- Leave it alone if we don't have the file locally
        Nothing -> old_src
        Just v -> v

    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue change_src_func

    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_local =
      processAttrl $ (change_src `when` (hasName "src"))
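
-- For example, if the 'ImageMap' maps a remote URL to the local file we
-- downloaded it to (the values below are illustrative only):
--
--   image_map = Map.fromList
--     [("https://lwn.net/images/2012/foo.png", "foo.png")]
--
-- then <img src="https://lwn.net/images/2012/foo.png" /> is rewritten to
-- <img src="foo.png" />, while images that failed to download keep their
-- original src.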


parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse xml = do
  let clean_xml = xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map
  appr <- ap_parse local_xml
  fppr <- fp_parse local_xml
  return $
    if (isNothing appr) then
      fppr
    else
      appr


parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  let element_filter = xml >>> css "div.PageHeadline h1"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one headline."


parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml = do
  let element_filter = xml >>> css "div.FeatureByLine"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one article byline."


--
-- ArticlePage Stuff
--

ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
ap_parse xml = do
  arts <- ap_parse_articles xml
  case arts of
    [x] -> return $ Just $ ArticlePage x
    _   -> return Nothing


ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  let element_filter = xml >>> css "div.ArticleText"
  let element_html_filter = xshow element_filter
  element_html <- runX element_html_filter
  return $ case element_html of
             [x] -> Just x
             []  -> Nothing
             _   -> error "Found more than one article."


ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  parsed_headline <- parse_headline xml
  parsed_byline <- parse_byline xml
  parsed_body <- ap_parse_body xml

  -- Debugging aid: print the headline if we found one. (Using
  -- 'fromJust' here would crash on pages without a headline.)
  maybe (return ()) putStrLn parsed_headline

  if (isNothing parsed_headline) || (isNothing parsed_body)
  then return []
  else do
    let title' = Title $ fromJust parsed_headline
    let byline' = Byline parsed_byline
    let body' = BodyHtml $ fromJust parsed_body

    return $ [Article title' byline' body']


--
-- FullPage Stuff
--

fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
fp_parse xml = do
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  case parsed_articles of
    [] -> return Nothing
    x  -> return $ Just $ FullPage (fromJust hl) x


fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  let element_filter = xml >>> css "h2.SummaryHL"
  let element_text_filter = element_filter //> getText
  element_text <- runX element_text_filter
  return $ case element_text of
             [x] -> Just $ strip x
             []  -> Nothing
             _   -> error "Found more than one article title."


is_title :: (ArrowXml a) => a XmlTree XmlTree
is_title =
  (hasName "h2")
  >>>
  (hasAttrValue "class" (== "SummaryHL"))


is_byline :: (ArrowXml a) => a XmlTree XmlTree
is_byline =
  (hasName "div")
  >>>
  (hasAttrValue "class" (== "FeatureByLine"))


is_image :: (ArrowXml a) => a XmlTree XmlTree
is_image = isElem >>> hasName "img"


remove_title :: (ArrowXml a) => a XmlTree XmlTree
remove_title =
  processTopDown (none `when` is_title)


remove_byline :: (ArrowXml a) => a XmlTree XmlTree
remove_byline =
  processTopDown (none `when` is_byline)


fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  -- First, delete the article title and byline.
  let clean_xml' = xml >>> remove_title >>> remove_byline
  -- The only child of the body element should be a div.lwn-article
  -- since we wrapped the article's HTML in that.
  let clean_xml = clean_xml' >>> css "body" >>> getChildren
  clean_html <- runX . xshow $ clean_xml
  return $ case clean_html of
             [x] -> Just x
             []  -> Nothing
             _   -> error "Found more than one article body."


fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  parsed_article_title <- fp_parse_article_title xml
  parsed_article_byline <- parse_byline xml
  parsed_article_body <- fp_parse_article_body xml

  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
  then
    return Nothing
  else do
    let title' = Title $ fromJust parsed_article_title
    let byline' = Byline parsed_article_byline
    let body' = BodyHtml $ fromJust parsed_article_body
    return $ Just $ Article title' byline' body'


parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
  let xml = parseHtml $ wrap_in_body_div html
  fp_parse_article xml


-- | In the full page, all of the article titles and bodies are
--   wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
parse_bodies xml =
  xml >>> css "div.ArticleText"


fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  bodies <- runX . xshow $ parse_bodies xml
  let article_separator = "<h2 class=\"SummaryHL\">"
  let split_articles'' = split article_separator (concat bodies)
  -- The first element will contain the crap before the first <h2...>.
  let split_articles' = case split_articles'' of
                          (_:_) -> tail split_articles''
                          []    -> []
  -- Put the separator back, it was lost during the split.
  let split_articles = map (article_separator ++) split_articles'
  --_ <- mapM print_article split_articles
  real_articles <- mapM parse_html_article split_articles
  let just_articles = catMaybes real_articles
  return just_articles
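
-- A worked example of the splitting above, with the article bodies
-- abbreviated (the real input is the serialized div.ArticleText):
--
--   concat bodies    = "cruft<h2 class=\"SummaryHL\">A<h2 class=\"SummaryHL\">B"
--   split sep ...    = ["cruft", "A", "B"]
--   drop the head    = ["A", "B"]
--   map (sep ++) ... = ["<h2 class=\"SummaryHL\">A", "<h2 class=\"SummaryHL\">B"]
--
-- Each of those chunks is then wrapped by 'wrap_in_body_div' and parsed
-- individually by 'parse_html_article'.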


-- | This makes it easy to select otherwise-random chunks of html
--   using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"


--
-- Epublishable stuff
--

title :: Page -> String
title (ArticlePage a) = getTitle $ LWN.Article.title a
title (FullPage hl _) = hl


metadata :: Page -> IO String
metadata obj = do
  date <- getCurrentTime
  return $
    "<dc:creator>http://lwn.net/</dc:creator>\n" ++
    "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
    "<dc:language>en-US</dc:language>\n" ++
    "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
    "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"


epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  let xhtml = to_xhtml obj
  epmd <- metadata obj
  epub <- xhtml_to_epub epmd xhtml
  B.hPut handle epub
  hFlush handle
  hClose handle


xhtml_to_epub :: String -> String -> IO B.ByteString
xhtml_to_epub epmd =
  write_epub . read_html
  where
    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    write_epub = writeEPUB Nothing [] my_writer_options
    read_html = readHtml defaultParserState
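
-- A minimal sketch of how the pieces above fit together, assuming a
-- 'Page' value (the hypothetical 'my_page') has already been produced
-- by 'page_from_url':
--
--   import System.IO (IOMode(WriteMode), openBinaryFile)
--
--   write_epub_file :: Page -> FilePath -> IO ()
--   write_epub_file my_page path = do
--     handle <- openBinaryFile path WriteMode
--     -- epublish flushes and closes the handle itself.
--     epublish my_page handle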


--
-- Misc
--

image_srcs :: (ArrowXml a) => a XmlTree URL
image_srcs =
  css "img"
  >>>
  getAttrValue "src"
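
-- For example (illustrative only), running the arrow over a small
-- parsed fragment:
--
--   runX $ parseHtml "<img src=\"/foo.png\" />" >>> image_srcs
--
-- should yield ["/foo.png"].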

make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_image_srcs_absolute =
  processTopDown (make_srcs_absolute `when` is_image)
  where
    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue try_make_absolute_url

    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_absolute =
      processAttrl $ change_src `when` hasName "src"


--
-- Tests
--

test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Links are replaced with spans"
    expected_xml
    actual_xml
  where
    input_html = "<body><a href=\"#\">Hello, world!</a></body>"
    input_xml = parseHtml input_html
    expected_html = "<body><span>Hello, world!</span></body>"
    expected_xml'' = parseHtml expected_html


test_absolve_images :: Assertion
test_absolve_images = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Image srcs are made absolute"
    expected_xml
    actual_xml
  where
    input_html =
      "<body>" ++
      "<img src=\"/images/2012/example.jpg\" />" ++
      "</body>"
    input_xml = parseHtml input_html
    expected_html =
      "<body>" ++
      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
      "</body>"
    expected_xml'' = parseHtml expected_html


test_comments_removed :: Assertion
test_comments_removed = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Comment links are removed"
    expected_xml
    actual_xml
  where
    input_html =
      "<body><p>" ++
      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
      "</p></body>"
    input_xml = parseHtml input_html

    expected_html = "<body><p></p></body>"
    expected_xml'' = parseHtml expected_html


page_tests :: Test
page_tests =
  testGroup "Page Tests" [
    testCase "Links are replaced with spans" test_preprocess_links,
    testCase "Image srcs are made absolute" test_absolve_images,
    testCase "Comment links are removed" test_comments_removed ]