src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import qualified Data.Map as Map (lookup)
   7 import Data.Time (getCurrentTime)
   8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
   9 import Data.String.Utils (split, strip)
  10 import Data.Maybe (catMaybes, fromJust, isNothing)
  11 import Prelude hiding (readFile)
  12 import System.Directory (doesFileExist)
  13 import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
  14 import System.IO.UTF8 (readFile)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   IOStateArrow,
  28   XmlTree,
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasAttrValue,
  37   hasName,
  38   isElem,
  39   mkName,
  40   none,
  41   processAttrl,
  42   processTopDown,
  43   runX,
  44   setElemName,
  45   xshow,
  46   when)
  47 import Text.HandsomeSoup (css, parseHtml)
  48
  49 import Configuration (Cfg, password, use_account, username)
  50 import LWN.Article
  51 import LWN.HTTP (
  52   ImageMap,
  53   download_image_urls,
  54   get_page,
  55   log_in,
  56   make_cookie_jar)
  57 import LWN.URI (URL, try_make_absolute_url)
  58 import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
  59 import Misc (contains)
  60
  61
  62 -- | Try to parse the given article using HXT. We try a few different
  63 --   methods; if none of them work, we return 'Nothing'.
  64 get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
  65 get_xml_from_article cfg article_name = do
  66   my_article <- real_article_path article_name
  67   is_file <- doesFileExist my_article
  68   case is_file of
  69     True -> do
  70       contents <- readFile my_article
  71       return $ Just $ parse_lwn contents
  72     False -> do
  73       -- Download the URL and try to parse it.
  74       if use_account cfg then do
  75         -- use_account would be false if these fromJusts would fail.
  76         cj <- make_cookie_jar
  77         li_result <- log_in cj
  78                       (fromJust $ username cfg)
  79                       (fromJust $ password cfg)
  80
  81         case li_result of
  82           Left err -> do
  83             let msg = "Failed to log in. " ++ err
  84             hPutStrLn stderr msg
  85           Right response_body -> do
  86             hPutStrLn stderr response_body
  87
  88         html <- get_page (Just cj) my_article
  89
  90         case html of
  91           Left err -> do
  92             let msg = "Failed to retrieve page. " ++ err
  93             hPutStrLn stderr msg
  94             return Nothing
  95           Right h -> return $ Just $ parse_lwn h
  96       else do
  97         html <- get_page Nothing my_article
  98         case html of
  99           Left err -> do
 100             let msg = "Failed to retrieve page. " ++ err
 101             hPutStrLn stderr msg
 102             return Nothing
 103           Right h -> return $ Just $ parse_lwn h
 104
 105
 106
 107 -- Should be called *after* preprocessing.
 108 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 109 download_images xml = do
 110   image_urls <- runX $ xml >>> image_srcs
 111   download_image_urls image_urls
 112
 113
 114 data Page =
 115   -- | An LWN page with one article on it.
 116   ArticlePage { article :: Article } |
 117
 118   -- | An LWN page with more than one article on it. These require
 119   --   different parsing and display functions than the single-article
 120   --   pages.
 121   FullPage { headline :: String,
 122              articles :: [Article] }
 123
 124
 125 instance XHTML Page where
 126   to_xhtml (ArticlePage a) =
 127     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
 128     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
 129     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 130     "<head>" ++
 131     "  <meta http-equiv=\"Content-Type\"" ++
 132     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 133     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
 134     "</head>" ++
 135     "<body>" ++
 136     "<div>" ++
 137     (to_xhtml a) ++
 138     "</div>" ++
 139     "</body>" ++
 140     "</html>"
 141
 142   to_xhtml (FullPage hl as) =
 143     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
 144     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
 145     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 146     "<head>" ++
 147     "  <meta http-equiv=\"Content-Type\"" ++
 148     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 149     "  <title>" ++ hl ++ "</title>" ++
 150     "</head>" ++
 151     "<body>" ++
 152     "<div>" ++
 153     "<h1>" ++ hl ++ "</h1>" ++
 154     (concatMap to_xhtml as) ++
 155     "</div>" ++
 156     "</body>" ++
 157     "</html>"
 158
 159
 160
 161 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 162 page_from_url cfg url = do
 163   maybe_html <- get_xml_from_article cfg url
 164   case maybe_html of
 165     Just html -> parse html
 166     Nothing -> return Nothing
 167
 168
 169 is_link :: (ArrowXml a) => a XmlTree XmlTree
 170 is_link =
 171   isElem >>> hasName "a"
 172
 173
 174 remove_comment_links  :: (ArrowXml a) => a XmlTree XmlTree
 175 remove_comment_links =
 176   processTopDown $ kill_comments `when` is_link
 177   where
 178     is_comment_link =
 179       hasAttrValue "href" (contains "#Comments")
 180
 181     kill_comments =
 182       none `when` is_comment_link
 183
 184 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
 185 replace_links_with_spans =
 186   processTopDown $ (make_span >>> remove_attrs) `when` is_link
 187   where
 188     make_span    = setElemName $ mkName "span"
 189     remove_attrs = processAttrl none
 190
 191
 192 -- | Preprocessing common to both page types.
 193 preprocess :: (ArrowXml a) => a XmlTree XmlTree
 194 preprocess =
 195   make_image_srcs_absolute
 196   >>>
 197   remove_comment_links
 198   >>>
 199   replace_links_with_spans
 200
 201
 202 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 203 replace_remote_img_srcs image_map =
 204   processTopDown (make_srcs_local `when` is_image)
 205   where
 206     -- old_src -> new_src
 207     change_src_func :: String -> String
 208     change_src_func old_src =
 209       case Map.lookup old_src image_map of
 210         -- Leave it alone if we don't have the file locally
 211         Nothing -> old_src
 212         Just v -> v
 213
 214     change_src :: (ArrowXml a) => a XmlTree XmlTree
 215     change_src =
 216       changeAttrValue change_src_func
 217
 218     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 219     make_srcs_local =
 220       processAttrl $ (change_src `when` (hasName "src"))
 221
 222
 223 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 224 parse xml = do
 225   let clean_xml = xml >>> preprocess
 226   image_map <- download_images clean_xml
 227   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 228   appr <- ap_parse local_xml
 229   fppr <- fp_parse local_xml
 230   return $
 231     if (isNothing appr) then
 232       fppr
 233     else
 234       appr
 235
 236
 237
 238 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 239 parse_headline xml = do
 240   let element_filter = xml >>> css "div.PageHeadline h1"
 241   let element_text_filter = element_filter /> getText
 242   element_text <- runX element_text_filter
 243   return $
 244     case element_text of
 245       [x] -> Just $ strip x
 246       []  -> Nothing
 247       _   -> error "Found more than one headline."
 248
 249
 250 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 251 parse_byline xml = do
 252   let element_filter = xml >>> css "div.FeatureByLine"
 253   let element_text_filter = element_filter /> getText
 254   element_text <- runX element_text_filter
 255   return $
 256     case element_text of
 257       [x] -> Just $ strip x
 258       []  -> Nothing
 259       _   -> error "Found more than one article byline."
 260
 261
 262 --
 263 -- ArticlePage Stuff
 264 --
 265 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 266 ap_parse xml = do
 267     arts <- ap_parse_articles xml
 268     case arts of
 269       [x] -> return $ Just $ ArticlePage x
 270       _   -> return Nothing
 271
 272
 273 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 274 ap_parse_body xml = do
 275   let element_filter = xml >>> css "div.ArticleText"
 276   let element_html_filter = xshow element_filter
 277   element_html <- runX element_html_filter
 278   return $ case element_html of
 279             [x] -> Just x
 280             []  -> Nothing
 281             _   -> error "Found more than one article."
 282
 283
 284 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 285 ap_parse_articles xml = do
 286   parsed_headline <- parse_headline xml
 287   parsed_byline   <- parse_byline xml
 288   parsed_body     <- ap_parse_body xml
 289
 290   putStrLn $ fromJust parsed_headline
 291
 292   if (isNothing parsed_headline) || (isNothing parsed_body)
 293   then return []
 294   else do
 295     let title'  = Title    $ fromJust parsed_headline
 296     let byline' = Byline     parsed_byline
 297     let body'   = BodyHtml $ fromJust parsed_body
 298
 299     return $ [Article title' byline' body']
 300
 301
 302
 303 --
 304 -- FullPage Stuff
 305 --
 306
 307 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 308 fp_parse xml = do
 309     hl <- parse_headline xml
 310     parsed_articles <- fp_parse_articles xml
 311     case parsed_articles of
 312       []          -> return Nothing
 313       x -> return $ Just $ FullPage (fromJust hl) x
 314
 315
 316
 317 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 318 fp_parse_article_title xml = do
 319   let element_filter = xml >>> css "h2.SummaryHL"
 320   let element_text_filter = element_filter //> getText
 321   element_text <- runX element_text_filter
 322   return $ case element_text of
 323             [x] -> Just $ strip x
 324             []  -> Nothing
 325             _   -> error "Found more than one article title."
 326
 327
 328
 329 is_title :: (ArrowXml a) => a XmlTree XmlTree
 330 is_title =
 331   (hasName "h2")
 332   >>>
 333   (hasAttrValue "class" (== "SummaryHL"))
 334
 335
 336 is_byline :: (ArrowXml a) => a XmlTree XmlTree
 337 is_byline =
 338   (hasName "div")
 339   >>>
 340   (hasAttrValue "class" (== "FeatureByLine"))
 341
 342
 343 is_image :: (ArrowXml a) => a XmlTree XmlTree
 344 is_image = isElem >>> hasName "img"
 345
 346 remove_title :: (ArrowXml a) => a XmlTree XmlTree
 347 remove_title =
 348   processTopDown ((none) `when` is_title)
 349
 350
 351 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
 352 remove_byline =
 353   processTopDown ((none) `when` is_byline)
 354
 355
 356
 357 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 358 fp_parse_article_body xml = do
 359   -- First, delete the article title and byline.
 360   let clean_xml' = xml >>> remove_title >>> remove_byline
 361   -- The only child of the body element should be a div.lwn-article
 362   -- since we wrapped the article's HTML in that.
 363   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 364   clean_html <- runX . xshow $ clean_xml
 365   return $ case clean_html of
 366             [x] -> Just x
 367             []  -> Nothing
 368             _   -> error "Found more than one article body."
 369
 370 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 371 fp_parse_article xml = do
 372   parsed_article_title    <- fp_parse_article_title xml
 373   parsed_article_byline   <- parse_byline xml
 374   parsed_article_body     <- fp_parse_article_body xml
 375
 376   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 377   then
 378     return Nothing
 379   else do
 380     let title'   = Title    $ fromJust parsed_article_title
 381     let byline'  = Byline     parsed_article_byline
 382     let body'    = BodyHtml $ fromJust parsed_article_body
 383     return $ Just $ Article title' byline' body'
 384
 385 parse_html_article :: String -> IO (Maybe Article)
 386 parse_html_article html = do
 387   let xml = parseHtml $ wrap_in_body_div html
 388   fp_parse_article xml
 389
 390
 391 -- | In the full page, all of the article titles and bodies are
 392 --   wrapped in one big div.ArticleText.
 393 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 394 parse_bodies xml =
 395   xml >>> css "div.ArticleText"
 396
 397
 398 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 399 fp_parse_articles xml = do
 400   bodies <- runX . xshow $ parse_bodies xml
 401   let article_separator = "<h2 class=\"SummaryHL\">"
 402   let split_articles'' = split article_separator (concat bodies)
 403   -- The first element will contain the crap before the first <h2...>.
 404   let split_articles' = case split_articles'' of
 405                           (_:_) -> tail split_articles''
 406                           []    -> []
 407   -- Put the separator back, it was lost during the split.
 408   let split_articles = map (article_separator ++) split_articles'
 409   --_ <- mapM print_article split_articles
 410   real_articles <- mapM parse_html_article split_articles
 411   let just_articles = catMaybes real_articles
 412   return just_articles
 413
 414
 415 -- | This makes it easy to select otherwise-random chunks of html
 416 --   using 'css'.
 417 wrap_in_body_div :: String -> String
 418 wrap_in_body_div s =
 419   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 420
 421
 422
 423
 424 --
 425 -- Epublishable stuff
 426 --
 427
 428 title :: Page -> String
 429 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 430 title (FullPage hl _) = hl
 431
 432
 433 metadata :: Page -> IO String
 434 metadata obj = do
 435   date <- getCurrentTime
 436   return $
 437     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 438     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 439     "<dc:language>en-US</dc:language>\n" ++
 440     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 441     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 442
 443
 444 epublish :: Page -> Handle -> IO ()
 445 epublish obj handle = do
 446   let xhtml = to_xhtml obj
 447   epmd <- metadata obj
 448   epub <- xhtml_to_epub epmd xhtml
 449   B.hPut handle epub
 450   hFlush handle
 451   hClose handle
 452
 453 xhtml_to_epub :: String -> String -> IO B.ByteString
 454 xhtml_to_epub epmd =
 455    write_epub . read_html
 456    where
 457      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 458      write_epub = writeEPUB Nothing [] my_writer_options
 459      read_html  = readHtml defaultParserState
 460
 461
 462
 463 --
 464 -- Misc
 465 --
 466
 467 image_srcs :: (ArrowXml a) => a XmlTree URL
 468 image_srcs =
 469   css "img"
 470   >>>
 471   getAttrValue "src"
 472
 473 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
 474 make_image_srcs_absolute =
 475   processTopDown (make_srcs_absolute `when` is_image)
 476   where
 477     change_src :: (ArrowXml a) => a XmlTree XmlTree
 478     change_src =
 479       changeAttrValue try_make_absolute_url
 480
 481     make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
 482     make_srcs_absolute =
 483       processAttrl $ change_src `when` hasName "src"
 484
 485
 486 --
 487 -- Tests
 488 --
 489
 490 test_preprocess_links :: Assertion
 491 test_preprocess_links = do
 492   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 493   let actual_xml = actual_xml' !! 0
 494
 495   expected_xml' <- runX $ expected_xml'' >>> css "body"
 496   let expected_xml = expected_xml' !! 0
 497
 498   assertEqual
 499     "Links are replaced with spans"
 500     expected_xml
 501     actual_xml
 502   where
 503     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 504     input_xml  = parseHtml input_html
 505     expected_html = "<body><span>Hello, world!</span></body>"
 506     expected_xml'' = parseHtml expected_html
 507
 508
 509 test_absolve_images :: Assertion
 510 test_absolve_images = do
 511   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 512   let actual_xml = actual_xml' !! 0
 513
 514   expected_xml' <- runX $ expected_xml'' >>> css "body"
 515   let expected_xml = expected_xml' !! 0
 516
 517   assertEqual
 518     "Image srcs are made absolute"
 519     expected_xml
 520     actual_xml
 521   where
 522     input_html =
 523       "<body>" ++
 524       "<img src=\"/images/2012/example.jpg\" />" ++
 525       "</body>"
 526     input_xml  = parseHtml input_html
 527     expected_html =
 528       "<body>" ++
 529       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 530       "</body>"
 531     expected_xml'' = parseHtml expected_html
 532
 533
 534 test_comments_removed :: Assertion
 535 test_comments_removed = do
 536   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 537   let actual_xml = actual_xml' !! 0
 538
 539   expected_xml' <- runX $ expected_xml'' >>> css "body"
 540   let expected_xml = expected_xml' !! 0
 541
 542   assertEqual
 543     "Comment links are removed"
 544     expected_xml
 545     actual_xml
 546   where
 547     input_html =
 548       "<body><p>" ++
 549       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 550       "</p></body>"
 551     input_xml  = parseHtml input_html
 552
 553     expected_html  = "<body><p></p></body>"
 554     expected_xml'' = parseHtml expected_html
 555
 556
 557
 558 page_tests :: Test
 559 page_tests =
 560   testGroup "Page Tests" [
 561     testCase "Links are replaced with spans" test_preprocess_links,
 562     testCase "Image srcs are made absolute" test_absolve_images,
 563     testCase "Comment links are removed" test_comments_removed ]