]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Move the main processing functions into the LWN.Page module.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
9 import Data.String.Utils (split, strip)
10 import Data.Maybe (catMaybes, fromJust, isNothing)
11 import Prelude hiding (readFile)
12 import System.Directory (doesFileExist)
13 import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
14 import System.IO.UTF8 (readFile)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata)
24 import Text.XML.HXT.Core (
25 ArrowXml,
26 IOSArrow,
27 IOStateArrow,
28 XmlTree,
29 (>>>),
30 (/>),
31 (//>),
32 changeAttrValue,
33 getAttrValue,
34 getChildren,
35 getText,
36 hasAttrValue,
37 hasName,
38 isElem,
39 mkName,
40 none,
41 processAttrl,
42 processTopDown,
43 runX,
44 setElemName,
45 xshow,
46 when)
47 import Text.HandsomeSoup (css, parseHtml)
48
49 import Configuration (Cfg, password, use_account, username)
50 import LWN.Article
51 import LWN.HTTP (
52 ImageMap,
53 download_image_urls,
54 get_page,
55 log_in,
56 make_cookie_jar)
57 import LWN.URI (URL, try_make_absolute_url)
58 import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
59 import Misc (contains)
60
61
62 -- | Try to parse the given article using HXT. We try a few different
63 -- methods; if none of them work, we return 'Nothing'.
64 get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
65 get_xml_from_article cfg article_name = do
66 my_article <- real_article_path article_name
67 is_file <- doesFileExist my_article
68 case is_file of
69 True -> do
70 contents <- readFile my_article
71 return $ Just $ parse_lwn contents
72 False -> do
73 -- Download the URL and try to parse it.
74 if use_account cfg then do
75 -- use_account would be false if these fromJusts would fail.
76 cj <- make_cookie_jar
77 li_result <- log_in cj
78 (fromJust $ username cfg)
79 (fromJust $ password cfg)
80
81 case li_result of
82 Left err -> do
83 let msg = "Failed to log in. " ++ err
84 hPutStrLn stderr msg
85 Right response_body -> do
86 hPutStrLn stderr response_body
87
88 html <- get_page (Just cj) my_article
89
90 case html of
91 Left err -> do
92 let msg = "Failed to retrieve page. " ++ err
93 hPutStrLn stderr msg
94 return Nothing
95 Right h -> return $ Just $ parse_lwn h
96 else do
97 html <- get_page Nothing my_article
98 case html of
99 Left err -> do
100 let msg = "Failed to retrieve page. " ++ err
101 hPutStrLn stderr msg
102 return Nothing
103 Right h -> return $ Just $ parse_lwn h
104
105
106
107 -- Should be called *after* preprocessing.
108 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
109 download_images xml = do
110 image_urls <- runX $ xml >>> image_srcs
111 download_image_urls image_urls
112
113
114 data Page =
115 -- | An LWN page with one article on it.
116 ArticlePage { article :: Article } |
117
118 -- | An LWN page with more than one article on it. These require
119 -- different parsing and display functions than the single-article
120 -- pages.
121 FullPage { headline :: String,
122 articles :: [Article] }
123
124
125 instance XHTML Page where
126 to_xhtml (ArticlePage a) =
127 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
128 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
129 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
130 "<head>" ++
131 " <meta http-equiv=\"Content-Type\"" ++
132 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
133 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
134 "</head>" ++
135 "<body>" ++
136 "<div>" ++
137 (to_xhtml a) ++
138 "</div>" ++
139 "</body>" ++
140 "</html>"
141
142 to_xhtml (FullPage hl as) =
143 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
144 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
145 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
146 "<head>" ++
147 " <meta http-equiv=\"Content-Type\"" ++
148 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
149 " <title>" ++ hl ++ "</title>" ++
150 "</head>" ++
151 "<body>" ++
152 "<div>" ++
153 "<h1>" ++ hl ++ "</h1>" ++
154 (concatMap to_xhtml as) ++
155 "</div>" ++
156 "</body>" ++
157 "</html>"
158
159
160
161 page_from_url :: Cfg -> URL -> IO (Maybe Page)
162 page_from_url cfg url = do
163 maybe_html <- get_xml_from_article cfg url
164 case maybe_html of
165 Just html -> parse html
166 Nothing -> return Nothing
167
168
169 is_link :: (ArrowXml a) => a XmlTree XmlTree
170 is_link =
171 isElem >>> hasName "a"
172
173
174 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
175 remove_comment_links =
176 processTopDown $ kill_comments `when` is_link
177 where
178 is_comment_link =
179 hasAttrValue "href" (contains "#Comments")
180
181 kill_comments =
182 none `when` is_comment_link
183
184 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
185 replace_links_with_spans =
186 processTopDown $ (make_span >>> remove_attrs) `when` is_link
187 where
188 make_span = setElemName $ mkName "span"
189 remove_attrs = processAttrl none
190
191
192 -- | Preprocessing common to both page types.
193 preprocess :: (ArrowXml a) => a XmlTree XmlTree
194 preprocess =
195 make_image_srcs_absolute
196 >>>
197 remove_comment_links
198 >>>
199 replace_links_with_spans
200
201
202 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
203 replace_remote_img_srcs image_map =
204 processTopDown (make_srcs_local `when` is_image)
205 where
206 -- old_src -> new_src
207 change_src_func :: String -> String
208 change_src_func old_src =
209 case Map.lookup old_src image_map of
210 -- Leave it alone if we don't have the file locally
211 Nothing -> old_src
212 Just v -> v
213
214 change_src :: (ArrowXml a) => a XmlTree XmlTree
215 change_src =
216 changeAttrValue change_src_func
217
218 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
219 make_srcs_local =
220 processAttrl $ (change_src `when` (hasName "src"))
221
222
223 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
224 parse xml = do
225 let clean_xml = xml >>> preprocess
226 image_map <- download_images clean_xml
227 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
228 appr <- ap_parse local_xml
229 fppr <- fp_parse local_xml
230 return $
231 if (isNothing appr) then
232 fppr
233 else
234 appr
235
236
237
238 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
239 parse_headline xml = do
240 let element_filter = xml >>> css "div.PageHeadline h1"
241 let element_text_filter = element_filter /> getText
242 element_text <- runX element_text_filter
243 return $
244 case element_text of
245 [x] -> Just $ strip x
246 [] -> Nothing
247 _ -> error "Found more than one headline."
248
249
250 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
251 parse_byline xml = do
252 let element_filter = xml >>> css "div.FeatureByLine"
253 let element_text_filter = element_filter /> getText
254 element_text <- runX element_text_filter
255 return $
256 case element_text of
257 [x] -> Just $ strip x
258 [] -> Nothing
259 _ -> error "Found more than one article byline."
260
261
262 --
263 -- ArticlePage Stuff
264 --
265 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
266 ap_parse xml = do
267 arts <- ap_parse_articles xml
268 case arts of
269 [x] -> return $ Just $ ArticlePage x
270 _ -> return Nothing
271
272
273 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
274 ap_parse_body xml = do
275 let element_filter = xml >>> css "div.ArticleText"
276 let element_html_filter = xshow element_filter
277 element_html <- runX element_html_filter
278 return $ case element_html of
279 [x] -> Just x
280 [] -> Nothing
281 _ -> error "Found more than one article."
282
283
284 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
285 ap_parse_articles xml = do
286 parsed_headline <- parse_headline xml
287 parsed_byline <- parse_byline xml
288 parsed_body <- ap_parse_body xml
289
290 putStrLn $ fromJust parsed_headline
291
292 if (isNothing parsed_headline) || (isNothing parsed_body)
293 then return []
294 else do
295 let title' = Title $ fromJust parsed_headline
296 let byline' = Byline parsed_byline
297 let body' = BodyHtml $ fromJust parsed_body
298
299 return $ [Article title' byline' body']
300
301
302
303 --
304 -- FullPage Stuff
305 --
306
307 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
308 fp_parse xml = do
309 hl <- parse_headline xml
310 parsed_articles <- fp_parse_articles xml
311 case parsed_articles of
312 [] -> return Nothing
313 x -> return $ Just $ FullPage (fromJust hl) x
314
315
316
317 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
318 fp_parse_article_title xml = do
319 let element_filter = xml >>> css "h2.SummaryHL"
320 let element_text_filter = element_filter //> getText
321 element_text <- runX element_text_filter
322 return $ case element_text of
323 [x] -> Just $ strip x
324 [] -> Nothing
325 _ -> error "Found more than one article title."
326
327
328
329 is_title :: (ArrowXml a) => a XmlTree XmlTree
330 is_title =
331 (hasName "h2")
332 >>>
333 (hasAttrValue "class" (== "SummaryHL"))
334
335
336 is_byline :: (ArrowXml a) => a XmlTree XmlTree
337 is_byline =
338 (hasName "div")
339 >>>
340 (hasAttrValue "class" (== "FeatureByLine"))
341
342
343 is_image :: (ArrowXml a) => a XmlTree XmlTree
344 is_image = isElem >>> hasName "img"
345
346 remove_title :: (ArrowXml a) => a XmlTree XmlTree
347 remove_title =
348 processTopDown ((none) `when` is_title)
349
350
351 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
352 remove_byline =
353 processTopDown ((none) `when` is_byline)
354
355
356
357 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
358 fp_parse_article_body xml = do
359 -- First, delete the article title and byline.
360 let clean_xml' = xml >>> remove_title >>> remove_byline
361 -- The only child of the body element should be a div.lwn-article
362 -- since we wrapped the article's HTML in that.
363 let clean_xml = clean_xml' >>> css "body" >>> getChildren
364 clean_html <- runX . xshow $ clean_xml
365 return $ case clean_html of
366 [x] -> Just x
367 [] -> Nothing
368 _ -> error "Found more than one article body."
369
370 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
371 fp_parse_article xml = do
372 parsed_article_title <- fp_parse_article_title xml
373 parsed_article_byline <- parse_byline xml
374 parsed_article_body <- fp_parse_article_body xml
375
376 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
377 then
378 return Nothing
379 else do
380 let title' = Title $ fromJust parsed_article_title
381 let byline' = Byline parsed_article_byline
382 let body' = BodyHtml $ fromJust parsed_article_body
383 return $ Just $ Article title' byline' body'
384
385 parse_html_article :: String -> IO (Maybe Article)
386 parse_html_article html = do
387 let xml = parseHtml $ wrap_in_body_div html
388 fp_parse_article xml
389
390
391 -- | In the full page, all of the article titles and bodies are
392 -- wrapped in one big div.ArticleText.
393 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
394 parse_bodies xml =
395 xml >>> css "div.ArticleText"
396
397
398 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
399 fp_parse_articles xml = do
400 bodies <- runX . xshow $ parse_bodies xml
401 let article_separator = "<h2 class=\"SummaryHL\">"
402 let split_articles'' = split article_separator (concat bodies)
403 -- The first element will contain the crap before the first <h2...>.
404 let split_articles' = case split_articles'' of
405 (_:_) -> tail split_articles''
406 [] -> []
407 -- Put the separator back, it was lost during the split.
408 let split_articles = map (article_separator ++) split_articles'
409 --_ <- mapM print_article split_articles
410 real_articles <- mapM parse_html_article split_articles
411 let just_articles = catMaybes real_articles
412 return just_articles
413
414
415 -- | This makes it easy to select otherwise-random chunks of html
416 -- using 'css'.
417 wrap_in_body_div :: String -> String
418 wrap_in_body_div s =
419 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
420
421
422
423
424 --
425 -- Epublishable stuff
426 --
427
428 title :: Page -> String
429 title (ArticlePage a) = getTitle $ LWN.Article.title a
430 title (FullPage hl _) = hl
431
432
433 metadata :: Page -> IO String
434 metadata obj = do
435 date <- getCurrentTime
436 return $
437 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
438 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
439 "<dc:language>en-US</dc:language>\n" ++
440 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
441 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
442
443
444 epublish :: Page -> Handle -> IO ()
445 epublish obj handle = do
446 let xhtml = to_xhtml obj
447 epmd <- metadata obj
448 epub <- xhtml_to_epub epmd xhtml
449 B.hPut handle epub
450 hFlush handle
451 hClose handle
452
453 xhtml_to_epub :: String -> String -> IO B.ByteString
454 xhtml_to_epub epmd =
455 write_epub . read_html
456 where
457 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
458 write_epub = writeEPUB Nothing [] my_writer_options
459 read_html = readHtml defaultParserState
460
461
462
463 --
464 -- Misc
465 --
466
467 image_srcs :: (ArrowXml a) => a XmlTree URL
468 image_srcs =
469 css "img"
470 >>>
471 getAttrValue "src"
472
473 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
474 make_image_srcs_absolute =
475 processTopDown (make_srcs_absolute `when` is_image)
476 where
477 change_src :: (ArrowXml a) => a XmlTree XmlTree
478 change_src =
479 changeAttrValue try_make_absolute_url
480
481 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
482 make_srcs_absolute =
483 processAttrl $ change_src `when` hasName "src"
484
485
486 --
487 -- Tests
488 --
489
490 test_preprocess_links :: Assertion
491 test_preprocess_links = do
492 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
493 let actual_xml = actual_xml' !! 0
494
495 expected_xml' <- runX $ expected_xml'' >>> css "body"
496 let expected_xml = expected_xml' !! 0
497
498 assertEqual
499 "Links are replaced with spans"
500 expected_xml
501 actual_xml
502 where
503 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
504 input_xml = parseHtml input_html
505 expected_html = "<body><span>Hello, world!</span></body>"
506 expected_xml'' = parseHtml expected_html
507
508
509 test_absolve_images :: Assertion
510 test_absolve_images = do
511 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
512 let actual_xml = actual_xml' !! 0
513
514 expected_xml' <- runX $ expected_xml'' >>> css "body"
515 let expected_xml = expected_xml' !! 0
516
517 assertEqual
518 "Image srcs are made absolute"
519 expected_xml
520 actual_xml
521 where
522 input_html =
523 "<body>" ++
524 "<img src=\"/images/2012/example.jpg\" />" ++
525 "</body>"
526 input_xml = parseHtml input_html
527 expected_html =
528 "<body>" ++
529 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
530 "</body>"
531 expected_xml'' = parseHtml expected_html
532
533
534 test_comments_removed :: Assertion
535 test_comments_removed = do
536 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
537 let actual_xml = actual_xml' !! 0
538
539 expected_xml' <- runX $ expected_xml'' >>> css "body"
540 let expected_xml = expected_xml' !! 0
541
542 assertEqual
543 "Comment links are removed"
544 expected_xml
545 actual_xml
546 where
547 input_html =
548 "<body><p>" ++
549 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
550 "</p></body>"
551 input_xml = parseHtml input_html
552
553 expected_html = "<body><p></p></body>"
554 expected_xml'' = parseHtml expected_html
555
556
557
558 page_tests :: Test
559 page_tests =
560 testGroup "Page Tests" [
561 testCase "Links are replaced with spans" test_preprocess_links,
562 testCase "Image srcs are made absolute" test_absolve_images,
563 testCase "Comment links are removed" test_comments_removed ]