src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata,
  24   writerUserDataDir)
  25 import Text.Pandoc.Shared ( readDataFile )
  26 import Text.XML.HXT.Core (
  27   ArrowXml,
  28   IOSArrow,
  29   XmlTree,
  30   ($<),
  31   (>>>),
  32   (/>),
  33   (//>),
  34   changeAttrValue,
  35   getAttrValue,
  36   getChildren,
  37   getText,
  38   hasName,
  39   none,
  40   processAttrl,
  41   processTopDown,
  42   this,
  43   runX,
  44   xshow,
  45   when)
  46 import Text.HandsomeSoup (css, parseHtml)
  47
  48 import Configuration (Cfg, full_stories)
  49 import LWN.Article
  50 import LWN.HTTP (
  51   ImageMap,
  52   download_image_urls,
  53   get_article_contents)
  54 import LWN.URI (URL)
  55 import LWN.XHTML (
  56   XHTML,
  57   full_story_urls,
  58   image_srcs,
  59   full_story_link,
  60   full_story_paragraph,
  61   is_image,
  62   preprocess,
  63   remove_byline,
  64   remove_full_story_paragraphs,
  65   remove_title,
  66   to_xhtml,
  67   to_xml,
  68   xml_from_contents)
  69
  70
  71
  72
  73 data Page =
  74   -- | An LWN page with one article on it.
  75   ArticlePage { article :: Article } |
  76
  77   -- | An LWN page with more than one article on it. These require
  78   --   different parsing and display functions than the single-article
  79   --   pages.
  80   FullPage { headline :: String,
  81              articles :: [Article] }
  82
  83
  84 instance XHTML Page where
  85   to_xhtml (ArticlePage a) =
  86     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  87     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  88     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  89     "<head>" ++
  90     "  <meta http-equiv=\"Content-Type\"" ++
  91     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  92     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  93     "</head>" ++
  94     "<body>" ++
  95     (to_xhtml a) ++
  96     "</body>" ++
  97     "</html>"
  98
  99   to_xhtml (FullPage hl as) =
 100     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
 101     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
 102     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 103     "<head>" ++
 104     "  <meta http-equiv=\"Content-Type\"" ++
 105     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 106     "  <title>" ++ hl ++ "</title>" ++
 107     "</head>" ++
 108     "<body>" ++
 109     "<div>" ++
 110     "<h1>" ++ hl ++ "</h1>" ++
 111     (concatMap to_xhtml as) ++
 112     "</div>" ++
 113     "</body>" ++
 114     "</html>"
 115
 116
 117
 118 -- | Stolen from writeEPUB.
 119 default_stylesheet :: IO String
 120 default_stylesheet =
 121   -- This comes with Pandoc, I guess.
 122   readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
 123
 124
 125 construct_stylesheet :: IO String
 126 construct_stylesheet = do
 127   defaults <- default_stylesheet
 128   -- Allow word-wrapping in <pre> elements.
 129   let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
 130   return $ defaults ++ my_additions
 131
 132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 133 page_from_url cfg url = do
 134   contents <- get_article_contents cfg url
 135   case (xml_from_contents contents) of
 136     Just html -> parse cfg html
 137     Nothing -> return Nothing
 138
 139
 140
 141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 142 insert_full_stories story_map =
 143   processTopDown (article_xml `when` full_story_paragraph)
 144   where
 145     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 146     lookup_func href =
 147       case Map.lookup href story_map of
 148         -- Drop the paragraph if we don't have the contents.
 149         Nothing -> none
 150         Just v -> to_xml v
 151
 152     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 153     article_xml =
 154       lookup_func
 155       $< -- From HXT's Control.Arrow.ArrowList
 156       (this /> full_story_link >>> getAttrValue "href")
 157
 158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 159 replace_remote_img_srcs image_map =
 160   processTopDown (make_srcs_local `when` is_image)
 161   where
 162     -- old_src -> new_src
 163     change_src_func :: String -> String
 164     change_src_func old_src =
 165       case Map.lookup old_src image_map of
 166         -- If we don't have the file, empty the src. Pandoc will crash
 167         -- otherwise.
 168         Nothing -> ""
 169         Just v -> v
 170
 171     change_src :: (ArrowXml a) => a XmlTree XmlTree
 172     change_src =
 173       changeAttrValue change_src_func
 174
 175     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 176     make_srcs_local =
 177       processAttrl $ (change_src `when` (hasName "src"))
 178
 179
 180
 181
 182 -- Should be called *after* preprocessing.
 183 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 184 download_images xml = do
 185   image_urls <- runX $ xml >>> image_srcs
 186   download_image_urls image_urls
 187
 188
 189
 190 type StoryMap = Map.Map URL Article
 191
 192 -- These come *before* preprocessing.
 193 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 194 download_full_story_urls cfg story_urls = do
 195   pages <- parallel $ map (page_from_url cfg) story_urls
 196   let pairs = zip story_urls pages
 197   return $ foldl my_insert empty_map pairs
 198   where
 199     empty_map = Map.empty :: StoryMap
 200
 201     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 202     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 203     my_insert dict (_, _)  = dict
 204
 205
 206 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 207 download_full_stories cfg xml = do
 208   story_urls <- runX $ xml >>> full_story_urls
 209   download_full_story_urls cfg story_urls
 210
 211
 212 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 213 parse cfg xml = do
 214   fs_xml <- if (full_stories cfg) then do
 215              story_map <- download_full_stories cfg xml
 216              return $ xml >>> insert_full_stories story_map
 217            else do
 218              -- Get rid of them if we don't want them.
 219              return $ xml >>> remove_full_story_paragraphs
 220
 221   let clean_xml = fs_xml >>> preprocess
 222   image_map <- download_images clean_xml
 223   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 224
 225   appr <- ap_parse local_xml
 226   fppr <- fp_parse local_xml
 227   return $
 228     if (isNothing appr) then
 229       fppr
 230     else
 231       appr
 232
 233
 234
 235 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 236 parse_headline xml = do
 237   let element_filter = xml >>> css "div.PageHeadline h1"
 238   let element_text_filter = element_filter /> getText
 239   element_text <- runX element_text_filter
 240   return $
 241     case element_text of
 242       [x] -> Just $ strip x
 243       []  -> Nothing
 244       _   -> error "Found more than one headline."
 245
 246
 247 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 248 parse_byline xml = do
 249   let element_filter = xml >>> css "div.FeatureByLine"
 250   let element_text_filter = element_filter /> getText
 251   element_text <- runX element_text_filter
 252   return $
 253     case element_text of
 254       [x] -> Just $ strip x
 255       []  -> Nothing
 256       _   -> error "Found more than one article byline."
 257
 258
 259 --
 260 -- ArticlePage Stuff
 261 --
 262 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 263 ap_parse xml = do
 264     arts <- ap_parse_articles xml
 265     case arts of
 266       [x] -> return $ Just $ ArticlePage x
 267       _   -> return Nothing
 268
 269
 270 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 271 ap_parse_body xml = do
 272   let element_filter = xml >>> css "div.ArticleText"
 273   let element_html_filter = xshow element_filter
 274   element_html <- runX element_html_filter
 275   return $ case element_html of
 276             [x] -> Just x
 277             []  -> Nothing
 278             _   -> error "Found more than one article."
 279
 280
 281 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 282 ap_parse_articles xml = do
 283   parsed_headline <- parse_headline xml
 284   parsed_byline   <- parse_byline xml
 285   parsed_body     <- ap_parse_body xml
 286
 287   if (isNothing parsed_headline) || (isNothing parsed_body)
 288   then return []
 289   else do
 290     let title'  = Title    $ fromJust parsed_headline
 291     let byline' = Byline     parsed_byline
 292     let body'   = BodyHtml $ fromJust parsed_body
 293
 294     return $ [Article title' byline' body']
 295
 296
 297
 298 --
 299 -- FullPage Stuff
 300 --
 301
 302 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 303 fp_parse xml = do
 304     hl <- parse_headline xml
 305     parsed_articles <- fp_parse_articles xml
 306     return $ case parsed_articles of
 307       [] -> Nothing
 308       x  -> Just $ FullPage (fromJust hl) x
 309
 310
 311
 312 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 313 fp_parse_article_title xml = do
 314   let element_filter = xml >>> css "h2.SummaryHL"
 315   let element_text_filter = element_filter //> getText
 316   element_text <- runX element_text_filter
 317   return $ case element_text of
 318             [x] -> Just $ strip x
 319             []  -> Nothing
 320             _   -> error "Found more than one article title."
 321
 322
 323
 324
 325 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 326 fp_parse_article_body xml = do
 327   -- First, delete the article title and byline.
 328   let clean_xml' = xml >>> remove_title >>> remove_byline
 329   -- The only child of the body element should be a div.lwn-article
 330   -- since we wrapped the article's HTML in that.
 331   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 332   clean_html <- runX . xshow $ clean_xml
 333   return $ case clean_html of
 334             [x] -> Just x
 335             []  -> Nothing
 336             _   -> error "Found more than one article body."
 337
 338 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 339 fp_parse_article xml = do
 340   parsed_article_title    <- fp_parse_article_title xml
 341   parsed_article_byline   <- parse_byline xml
 342   parsed_article_body     <- fp_parse_article_body xml
 343
 344   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 345   then
 346     return Nothing
 347   else do
 348     let title'   = Title    $ fromJust parsed_article_title
 349     let byline'  = Byline     parsed_article_byline
 350     let body'    = BodyHtml $ fromJust parsed_article_body
 351     return $ Just $ Article title' byline' body'
 352
 353 parse_html_article :: String -> IO (Maybe Article)
 354 parse_html_article html = do
 355   let xml = parseHtml $ wrap_in_body_div html
 356   fp_parse_article xml
 357
 358
 359 -- | In the full page, all of the article titles and bodies are
 360 --   wrapped in one big div.ArticleText.
 361 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 362 parse_bodies xml =
 363   xml >>> css "div.ArticleText"
 364
 365
 366 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 367 fp_parse_articles xml = do
 368   bodies <- runX . xshow $ parse_bodies xml
 369   let article_separator = "<h2 class=\"SummaryHL\">"
 370   let split_articles'' = split article_separator (concat bodies)
 371   -- The first element will contain the crap before the first <h2...>.
 372   let split_articles' = case split_articles'' of
 373                           (_:_) -> tail split_articles''
 374                           []    -> []
 375   -- Put the separator back, it was lost during the split.
 376   let split_articles = map (article_separator ++) split_articles'
 377   real_articles <- mapM parse_html_article split_articles
 378   let just_articles = catMaybes real_articles
 379   return just_articles
 380
 381
 382 -- | This makes it easy to select otherwise-random chunks of html
 383 --   using 'css'.
 384 wrap_in_body_div :: String -> String
 385 wrap_in_body_div s =
 386   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 387
 388
 389
 390
 391 --
 392 -- Epublishable stuff
 393 --
 394
 395 title :: Page -> String
 396 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 397 title (FullPage hl _) = hl
 398
 399
 400 metadata :: Page -> IO String
 401 metadata obj = do
 402   date <- getCurrentTime
 403   return $
 404     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 405     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 406     "<dc:language>en-US</dc:language>\n" ++
 407     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 408     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 409
 410
 411 epublish :: Page -> Handle -> IO ()
 412 epublish obj handle = do
 413   let xhtml = to_xhtml obj
 414   epmd <- metadata obj
 415   epub <- xhtml_to_epub epmd xhtml
 416   B.hPut handle epub
 417   hFlush handle
 418   hClose handle
 419
 420 xhtml_to_epub :: String -> String -> IO B.ByteString
 421 xhtml_to_epub epmd xhtml = do
 422   stylesheet <- construct_stylesheet
 423   writeEPUB
 424     (Just stylesheet)
 425     []
 426     my_writer_options
 427     (read_html xhtml)
 428   where
 429     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 430     read_html  = readHtml defaultParserState
 431
 432
 433 --
 434 -- Tests
 435 --
 436
 437 test_preprocess_links :: Assertion
 438 test_preprocess_links = do
 439   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 440   let actual_xml = actual_xml' !! 0
 441
 442   expected_xml' <- runX $ expected_xml'' >>> css "body"
 443   let expected_xml = expected_xml' !! 0
 444
 445   assertEqual
 446     "Links are replaced with spans"
 447     expected_xml
 448     actual_xml
 449   where
 450     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 451     input_xml  = parseHtml input_html
 452     expected_html = "<body><span>Hello, world!</span></body>"
 453     expected_xml'' = parseHtml expected_html
 454
 455
 456 test_absolve_images :: Assertion
 457 test_absolve_images = do
 458   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 459   let actual_xml = actual_xml' !! 0
 460
 461   expected_xml' <- runX $ expected_xml'' >>> css "body"
 462   let expected_xml = expected_xml' !! 0
 463
 464   assertEqual
 465     "Image srcs are made absolute"
 466     expected_xml
 467     actual_xml
 468   where
 469     input_html =
 470       "<body>" ++
 471       "<img src=\"/images/2012/example.jpg\" />" ++
 472       "</body>"
 473     input_xml  = parseHtml input_html
 474     expected_html =
 475       "<body>" ++
 476       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 477       "</body>"
 478     expected_xml'' = parseHtml expected_html
 479
 480
 481 test_comments_removed :: Assertion
 482 test_comments_removed = do
 483   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 484   let actual_xml = actual_xml' !! 0
 485
 486   expected_xml' <- runX $ expected_xml'' >>> css "body"
 487   let expected_xml = expected_xml' !! 0
 488
 489   assertEqual
 490     "Comment links are removed"
 491     expected_xml
 492     actual_xml
 493   where
 494     input_html =
 495       "<body><p>" ++
 496       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 497       "</p></body>"
 498     input_xml  = parseHtml input_html
 499
 500     expected_html  = "<body><p></p></body>"
 501     expected_xml'' = parseHtml expected_html
 502
 503
 504 test_full_story_urls_parsed :: Assertion
 505 test_full_story_urls_parsed = do
 506   actual <- runX $ actual'
 507
 508   assertEqual
 509     "Full Story URLs are parsed"
 510     expected
 511     actual
 512   where
 513     expected = ["/Articles/500738/", "/Articles/501837/"]
 514
 515     full_story_html =
 516       concat ["<p>",
 517               "<a href=\"/Articles/500738/\">Full Story</a> ",
 518               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 519               "<p>",
 520               "<a href=\"/Articles/501837/\">Full Story</a> ",
 521               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 522               "<p>"]
 523
 524     xml = parseHtml full_story_html
 525     actual' = xml >>> full_story_urls
 526
 527 page_tests :: Test
 528 page_tests =
 529   testGroup "Page Tests" [
 530     testCase "Links are replaced with spans" test_preprocess_links,
 531     testCase "Image srcs are made absolute" test_absolve_images,
 532     testCase "Comment links are removed" test_comments_removed,
 533     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]