{-# LANGUAGE DoAndIfThenElse #-}

module LWN.Page
where

import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.Directory (doesFileExist)
import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
import System.IO.UTF8 (readFile)
import Test.HUnit (Assertion, assertEqual)
import Test.Framework (Test, testGroup)
import Test.Framework.Providers.HUnit (testCase)
import Text.Pandoc (
  defaultParserState,
  defaultWriterOptions,
  readHtml,
  writeEPUB,
  writerEPUBMetadata)
import Text.XML.HXT.Core (
  ArrowXml,
  IOSArrow,
  IOStateArrow,
  XmlTree,
  (>>>),
  (/>),
  (//>),
  changeAttrValue,
  getAttrValue,
  getChildren,
  getText,
  hasAttrValue,
  hasName,
  isElem,
  mkName,
  none,
  processAttrl,
  processTopDown,
  runX,
  setElemName,
  xshow,
  when)
import Text.HandsomeSoup (css, parseHtml)

import Configuration (Cfg, password, use_account, username)
import LWN.Article
import LWN.HTTP (
  ImageMap,
  download_image_urls,
  get_page,
  log_in,
  make_cookie_jar)
import LWN.URI (URL, try_make_absolute_url)
import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
import Misc (contains)


-- | Try to parse the given article using HXT. We try a few different
--   methods; if none of them work, we return 'Nothing'.
get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
get_xml_from_article cfg article_name = do
  my_article <- real_article_path article_name
  is_file <- doesFileExist my_article
  case is_file of
    True -> do
      contents <- readFile my_article
      return $ Just $ parse_lwn contents
    False -> do
      -- Download the URL and try to parse it.
      if use_account cfg then do
        -- If use_account is True, the username and password are known
        -- to be set, so these fromJusts are safe.
        cj <- make_cookie_jar
        li_result <- log_in cj
                       (fromJust $ username cfg)
                       (fromJust $ password cfg)

        case li_result of
          Left err -> do
            let msg = "Failed to log in. " ++ err
            hPutStrLn stderr msg
          Right response_body -> do
            hPutStrLn stderr response_body

        html <- get_page (Just cj) my_article

        case html of
          Left err -> do
            let msg = "Failed to retrieve page. " ++ err
            hPutStrLn stderr msg
            return Nothing
          Right h -> return $ Just $ parse_lwn h
      else do
        html <- get_page Nothing my_article
        case html of
          Left err -> do
            let msg = "Failed to retrieve page. " ++ err
            hPutStrLn stderr msg
            return Nothing
          Right h -> return $ Just $ parse_lwn h


-- Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml = do
  image_urls <- runX $ xml >>> image_srcs
  download_image_urls image_urls


data Page =
  -- | An LWN page with one article on it.
  ArticlePage { article :: Article } |

  -- | An LWN page with more than one article on it. These require
  --   different parsing and display functions than the single-article
  --   pages.
  FullPage { headline :: String,
             articles :: [Article] }


instance XHTML Page where
  to_xhtml (ArticlePage a) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html>" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ (getTitle $ LWN.Article.title a) ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    (to_xhtml a) ++
    "</body>" ++
    "</html>"

  to_xhtml (FullPage hl as) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html>" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ hl ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    "<div>" ++
    "<h1>" ++ hl ++ "</h1>" ++
    (concatMap to_xhtml as) ++
    "</div>" ++
    "</body>" ++
    "</html>"
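
-- A quick GHCi-style sanity check of the serialization above. The
-- article value is made up purely for illustration; 'Title', 'Byline'
-- and 'BodyHtml' are the same constructors used by the parsers below.
--
--   ghci> let art = Article (Title "Example") (Byline Nothing) (BodyHtml "<p>Hi.</p>")
--   ghci> to_xhtml (ArticlePage art)
--   "<?xml version=\"1.0\" encoding=\"utf-8\" ?>...</html>"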


page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  maybe_html <- get_xml_from_article cfg url
  case maybe_html of
    Just html -> parse html
    Nothing -> return Nothing


is_link :: (ArrowXml a) => a XmlTree XmlTree
is_link =
  isElem >>> hasName "a"


remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
remove_comment_links =
  processTopDown $ kill_comments `when` is_link
  where
    is_comment_link =
      hasAttrValue "href" (contains "#Comments")

    kill_comments =
      none `when` is_comment_link


replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
replace_links_with_spans =
  processTopDown $ (make_span >>> remove_attrs) `when` is_link
  where
    make_span = setElemName $ mkName "span"
    remove_attrs = processAttrl none


-- | Preprocessing common to both page types.
preprocess :: (ArrowXml a) => a XmlTree XmlTree
preprocess =
  make_image_srcs_absolute
  >>>
  remove_comment_links
  >>>
  replace_links_with_spans


replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (make_srcs_local `when` is_image)
  where
    -- old_src -> new_src
    change_src_func :: String -> String
    change_src_func old_src =
      case Map.lookup old_src image_map of
        -- Leave it alone if we don't have the file locally
        Nothing -> old_src
        Just v -> v

    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue change_src_func

    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_local =
      processAttrl $ (change_src `when` (hasName "src"))
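
-- For example, if the 'ImageMap' maps a remote URL to the local file we
-- downloaded it to (the values below are illustrative only):
--
--   image_map = Map.fromList
--     [("https://lwn.net/images/2012/foo.png", "foo.png")]
--
-- then <img src="https://lwn.net/images/2012/foo.png" /> is rewritten to
-- <img src="foo.png" />, while images that failed to download keep their
-- original src.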


parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse xml = do
  let clean_xml = xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map
  appr <- ap_parse local_xml
  fppr <- fp_parse local_xml
  return $
    if (isNothing appr) then
      fppr
    else
      appr


parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  let element_filter = xml >>> css "div.PageHeadline h1"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one headline."


parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml = do
  let element_filter = xml >>> css "div.FeatureByLine"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one article byline."


--
-- ArticlePage Stuff
--

ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
ap_parse xml = do
  arts <- ap_parse_articles xml
  case arts of
    [x] -> return $ Just $ ArticlePage x
    _   -> return Nothing


ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  let element_filter = xml >>> css "div.ArticleText"
  let element_html_filter = xshow element_filter
  element_html <- runX element_html_filter
  return $ case element_html of
             [x] -> Just x
             []  -> Nothing
             _   -> error "Found more than one article."


ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  parsed_headline <- parse_headline xml
  parsed_byline <- parse_byline xml
  parsed_body <- ap_parse_body xml

  -- Debugging aid: print the headline if we found one. (Using
  -- 'fromJust' here would crash on pages without a headline.)
  maybe (return ()) putStrLn parsed_headline

  if (isNothing parsed_headline) || (isNothing parsed_body)
  then return []
  else do
    let title' = Title $ fromJust parsed_headline
    let byline' = Byline parsed_byline
    let body' = BodyHtml $ fromJust parsed_body

    return $ [Article title' byline' body']


--
-- FullPage Stuff
--

fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
fp_parse xml = do
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  case parsed_articles of
    [] -> return Nothing
    x  -> return $ Just $ FullPage (fromJust hl) x


fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  let element_filter = xml >>> css "h2.SummaryHL"
  let element_text_filter = element_filter //> getText
  element_text <- runX element_text_filter
  return $ case element_text of
             [x] -> Just $ strip x
             []  -> Nothing
             _   -> error "Found more than one article title."


is_title :: (ArrowXml a) => a XmlTree XmlTree
is_title =
  (hasName "h2")
  >>>
  (hasAttrValue "class" (== "SummaryHL"))


is_byline :: (ArrowXml a) => a XmlTree XmlTree
is_byline =
  (hasName "div")
  >>>
  (hasAttrValue "class" (== "FeatureByLine"))


is_image :: (ArrowXml a) => a XmlTree XmlTree
is_image = isElem >>> hasName "img"


remove_title :: (ArrowXml a) => a XmlTree XmlTree
remove_title =
  processTopDown (none `when` is_title)


remove_byline :: (ArrowXml a) => a XmlTree XmlTree
remove_byline =
  processTopDown (none `when` is_byline)


fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  -- First, delete the article title and byline.
  let clean_xml' = xml >>> remove_title >>> remove_byline
  -- The only child of the body element should be a div.lwn-article
  -- since we wrapped the article's HTML in that.
  let clean_xml = clean_xml' >>> css "body" >>> getChildren
  clean_html <- runX . xshow $ clean_xml
  return $ case clean_html of
             [x] -> Just x
             []  -> Nothing
             _   -> error "Found more than one article body."


fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  parsed_article_title <- fp_parse_article_title xml
  parsed_article_byline <- parse_byline xml
  parsed_article_body <- fp_parse_article_body xml

  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
  then
    return Nothing
  else do
    let title' = Title $ fromJust parsed_article_title
    let byline' = Byline parsed_article_byline
    let body' = BodyHtml $ fromJust parsed_article_body
    return $ Just $ Article title' byline' body'


parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
  let xml = parseHtml $ wrap_in_body_div html
  fp_parse_article xml


-- | In the full page, all of the article titles and bodies are
--   wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
parse_bodies xml =
  xml >>> css "div.ArticleText"


fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  bodies <- runX . xshow $ parse_bodies xml
  let article_separator = "<h2 class=\"SummaryHL\">"
  let split_articles'' = split article_separator (concat bodies)
  -- The first element will contain the crap before the first <h2...>.
  let split_articles' = case split_articles'' of
                          (_:_) -> tail split_articles''
                          []    -> []
  -- Put the separator back, it was lost during the split.
  let split_articles = map (article_separator ++) split_articles'
  --_ <- mapM print_article split_articles
  real_articles <- mapM parse_html_article split_articles
  let just_articles = catMaybes real_articles
  return just_articles
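
-- A worked example of the splitting above, with the article bodies
-- abbreviated (the real input is the serialized div.ArticleText):
--
--   concat bodies    = "cruft<h2 class=\"SummaryHL\">A<h2 class=\"SummaryHL\">B"
--   split sep ...    = ["cruft", "A", "B"]
--   drop the head    = ["A", "B"]
--   map (sep ++) ... = ["<h2 class=\"SummaryHL\">A", "<h2 class=\"SummaryHL\">B"]
--
-- Each of those chunks is then wrapped by 'wrap_in_body_div' and parsed
-- individually by 'parse_html_article'.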


-- | This makes it easy to select otherwise-random chunks of html
--   using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"


--
-- Epublishable stuff
--

title :: Page -> String
title (ArticlePage a) = getTitle $ LWN.Article.title a
title (FullPage hl _) = hl


metadata :: Page -> IO String
metadata obj = do
  date <- getCurrentTime
  return $
    "<dc:creator>http://lwn.net/</dc:creator>\n" ++
    "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
    "<dc:language>en-US</dc:language>\n" ++
    "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
    "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"


epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  let xhtml = to_xhtml obj
  epmd <- metadata obj
  epub <- xhtml_to_epub epmd xhtml
  B.hPut handle epub
  hFlush handle
  hClose handle


xhtml_to_epub :: String -> String -> IO B.ByteString
xhtml_to_epub epmd =
  write_epub . read_html
  where
    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    write_epub = writeEPUB Nothing [] my_writer_options
    read_html = readHtml defaultParserState
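
-- A minimal sketch of how the pieces above fit together, assuming a
-- 'Page' value (the hypothetical 'my_page') has already been produced
-- by 'page_from_url':
--
--   import System.IO (IOMode(WriteMode), openBinaryFile)
--
--   write_epub_file :: Page -> FilePath -> IO ()
--   write_epub_file my_page path = do
--     handle <- openBinaryFile path WriteMode
--     -- epublish flushes and closes the handle itself.
--     epublish my_page handle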


--
-- Misc
--

image_srcs :: (ArrowXml a) => a XmlTree URL
image_srcs =
  css "img"
  >>>
  getAttrValue "src"
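
-- For example (illustrative only), running the arrow over a small
-- parsed fragment:
--
--   runX $ parseHtml "<img src=\"/foo.png\" />" >>> image_srcs
--
-- should yield ["/foo.png"].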

make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_image_srcs_absolute =
  processTopDown (make_srcs_absolute `when` is_image)
  where
    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue try_make_absolute_url

    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_absolute =
      processAttrl $ change_src `when` hasName "src"


--
-- Tests
--

test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Links are replaced with spans"
    expected_xml
    actual_xml
  where
    input_html = "<body><a href=\"#\">Hello, world!</a></body>"
    input_xml = parseHtml input_html
    expected_html = "<body><span>Hello, world!</span></body>"
    expected_xml'' = parseHtml expected_html


test_absolve_images :: Assertion
test_absolve_images = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Image srcs are made absolute"
    expected_xml
    actual_xml
  where
    input_html =
      "<body>" ++
      "<img src=\"/images/2012/example.jpg\" />" ++
      "</body>"
    input_xml = parseHtml input_html
    expected_html =
      "<body>" ++
      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
      "</body>"
    expected_xml'' = parseHtml expected_html


test_comments_removed :: Assertion
test_comments_removed = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Comment links are removed"
    expected_xml
    actual_xml
  where
    input_html =
      "<body><p>" ++
      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
      "</p></body>"
    input_xml = parseHtml input_html

    expected_html = "<body><p></p></body>"
    expected_xml'' = parseHtml expected_html


page_tests :: Test
page_tests =
  testGroup "Page Tests" [
    testCase "Links are replaced with spans" test_preprocess_links,
    testCase "Image srcs are made absolute" test_absolve_images,
    testCase "Comment links are removed" test_comments_removed ]