src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata,
  24   writerUserDataDir)
  25 import Text.Pandoc.Shared ( readDataFile )
  26 import Text.XML.HXT.Core (
  27   ArrowXml,
  28   IOSArrow,
  29   XmlTree,
  30   ($<),
  31   (>>>),
  32   (/>),
  33   (//>),
  34   changeAttrValue,
  35   getAttrValue,
  36   getChildren,
  37   getText,
  38   hasName,
  39   none,
  40   processAttrl,
  41   processTopDown,
  42   this,
  43   runX,
  44   xshow,
  45   when)
  46 import Text.HandsomeSoup (css, parseHtml)
  47
  48 import Configuration (Cfg, full_stories)
  49 import LWN.Article
  50 import LWN.HTTP (
  51   ImageMap,
  52   download_image_urls,
  53   get_article_contents)
  54 import LWN.URI (URL)
  55 import LWN.XHTML (
  56   XHTML,
  57   full_story_urls,
  58   image_srcs,
  59   full_story_link,
  60   full_story_paragraph,
  61   is_image,
  62   preprocess,
  63   remove_byline,
  64   remove_full_story_paragraphs,
  65   remove_title,
  66   to_xhtml,
  67   to_xml,
  68   xml_from_contents)
  69
  70
  71
  72
  73 data Page =
  74   -- | An LWN page with one article on it.
  75   ArticlePage { article :: Article } |
  76
  77   -- | An LWN page with more than one article on it. These require
  78   --   different parsing and display functions than the single-article
  79   --   pages.
  80   FullPage { headline :: String,
  81              articles :: [Article] }
  82
  83
  84 instance XHTML Page where
  85   to_xhtml (ArticlePage a) =
  86     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  87     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  88     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  89     "<head>" ++
  90     "  <meta http-equiv=\"Content-Type\"" ++
  91     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  92     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  93     "</head>" ++
  94     "<body>" ++
  95     (to_xhtml a) ++
  96     "</body>" ++
  97     "</html>"
  98
  99   to_xhtml (FullPage hl as) =
 100     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
 101     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
 102     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 103     "<head>" ++
 104     "  <meta http-equiv=\"Content-Type\"" ++
 105     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 106     "  <title>" ++ hl ++ "</title>" ++
 107     "</head>" ++
 108     "<body>" ++
 109     "<div>" ++
 110     "<h1>" ++ hl ++ "</h1>" ++
 111     (concatMap to_xhtml as) ++
 112     "</div>" ++
 113     "</body>" ++
 114     "</html>"
 115
 116
 117
 118 -- | Stolen from writeEPUB.
 119 default_stylesheet :: IO String
 120 default_stylesheet =
 121   -- This comes with Pandoc, I guess.
 122   readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
 123
 124
 125 construct_stylesheet :: IO String
 126 construct_stylesheet = do
 127   defaults <- default_stylesheet
 128   -- Allow word-wrapping in <pre> elements.
 129   let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
 130   return $ defaults ++ my_additions
 131
 132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 133 page_from_url cfg url = do
 134   contents <- get_article_contents cfg url
 135   case (xml_from_contents contents) of
 136     Just html -> parse cfg html
 137     Nothing -> return Nothing
 138
 139
 140
 141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 142 insert_full_stories story_map =
 143   processTopDown (article_xml `when` full_story_paragraph)
 144   where
 145     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 146     lookup_func href =
 147       case Map.lookup href story_map of
 148         -- Drop the paragraph if we don't have the contents.
 149         Nothing -> none
 150         Just v -> to_xml v
 151
 152     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 153     article_xml =
 154       lookup_func
 155       $< -- From HXT's Control.Arrow.ArrowList
 156       (this /> full_story_link >>> getAttrValue "href")
 157
 158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 159 replace_remote_img_srcs image_map =
 160   processTopDown (make_srcs_local `when` is_image)
 161   where
 162     -- old_src -> new_src
 163     change_src_func :: String -> String
 164     change_src_func old_src =
 165       case Map.lookup old_src image_map of
 166         -- Leave it alone if we don't have the file locally
 167         Nothing -> old_src
 168         Just v -> v
 169
 170     change_src :: (ArrowXml a) => a XmlTree XmlTree
 171     change_src =
 172       changeAttrValue change_src_func
 173
 174     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 175     make_srcs_local =
 176       processAttrl $ (change_src `when` (hasName "src"))
 177
 178
 179
 180
 181 -- Should be called *after* preprocessing.
 182 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 183 download_images xml = do
 184   image_urls <- runX $ xml >>> image_srcs
 185   download_image_urls image_urls
 186
 187
 188
 189 type StoryMap = Map.Map URL Article
 190
 191 -- These come *before* preprocessing.
 192 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 193 download_full_story_urls cfg story_urls = do
 194   pages <- parallel $ map (page_from_url cfg) story_urls
 195   let pairs = zip story_urls pages
 196   return $ foldl my_insert empty_map pairs
 197   where
 198     empty_map = Map.empty :: StoryMap
 199
 200     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 201     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 202     my_insert dict (_, _)  = dict
 203
 204
 205 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 206 download_full_stories cfg xml = do
 207   story_urls <- runX $ xml >>> full_story_urls
 208   download_full_story_urls cfg story_urls
 209
 210
 211 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 212 parse cfg xml = do
 213   fs_xml <- if (full_stories cfg) then do
 214              story_map <- download_full_stories cfg xml
 215              return $ xml >>> insert_full_stories story_map
 216            else do
 217              -- Get rid of them if we don't want them.
 218              return $ xml >>> remove_full_story_paragraphs
 219
 220   let clean_xml = fs_xml >>> preprocess
 221   image_map <- download_images clean_xml
 222   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 223
 224   appr <- ap_parse local_xml
 225   fppr <- fp_parse local_xml
 226   return $
 227     if (isNothing appr) then
 228       fppr
 229     else
 230       appr
 231
 232
 233
 234 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 235 parse_headline xml = do
 236   let element_filter = xml >>> css "div.PageHeadline h1"
 237   let element_text_filter = element_filter /> getText
 238   element_text <- runX element_text_filter
 239   return $
 240     case element_text of
 241       [x] -> Just $ strip x
 242       []  -> Nothing
 243       _   -> error "Found more than one headline."
 244
 245
 246 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 247 parse_byline xml = do
 248   let element_filter = xml >>> css "div.FeatureByLine"
 249   let element_text_filter = element_filter /> getText
 250   element_text <- runX element_text_filter
 251   return $
 252     case element_text of
 253       [x] -> Just $ strip x
 254       []  -> Nothing
 255       _   -> error "Found more than one article byline."
 256
 257
 258 --
 259 -- ArticlePage Stuff
 260 --
 261 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 262 ap_parse xml = do
 263     arts <- ap_parse_articles xml
 264     case arts of
 265       [x] -> return $ Just $ ArticlePage x
 266       _   -> return Nothing
 267
 268
 269 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 270 ap_parse_body xml = do
 271   let element_filter = xml >>> css "div.ArticleText"
 272   let element_html_filter = xshow element_filter
 273   element_html <- runX element_html_filter
 274   return $ case element_html of
 275             [x] -> Just x
 276             []  -> Nothing
 277             _   -> error "Found more than one article."
 278
 279
 280 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 281 ap_parse_articles xml = do
 282   parsed_headline <- parse_headline xml
 283   parsed_byline   <- parse_byline xml
 284   parsed_body     <- ap_parse_body xml
 285
 286   if (isNothing parsed_headline) || (isNothing parsed_body)
 287   then return []
 288   else do
 289     let title'  = Title    $ fromJust parsed_headline
 290     let byline' = Byline     parsed_byline
 291     let body'   = BodyHtml $ fromJust parsed_body
 292
 293     return $ [Article title' byline' body']
 294
 295
 296
 297 --
 298 -- FullPage Stuff
 299 --
 300
 301 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 302 fp_parse xml = do
 303     hl <- parse_headline xml
 304     parsed_articles <- fp_parse_articles xml
 305     return $ case parsed_articles of
 306       [] -> Nothing
 307       x  -> Just $ FullPage (fromJust hl) x
 308
 309
 310
 311 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 312 fp_parse_article_title xml = do
 313   let element_filter = xml >>> css "h2.SummaryHL"
 314   let element_text_filter = element_filter //> getText
 315   element_text <- runX element_text_filter
 316   return $ case element_text of
 317             [x] -> Just $ strip x
 318             []  -> Nothing
 319             _   -> error "Found more than one article title."
 320
 321
 322
 323
 324 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 325 fp_parse_article_body xml = do
 326   -- First, delete the article title and byline.
 327   let clean_xml' = xml >>> remove_title >>> remove_byline
 328   -- The only child of the body element should be a div.lwn-article
 329   -- since we wrapped the article's HTML in that.
 330   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 331   clean_html <- runX . xshow $ clean_xml
 332   return $ case clean_html of
 333             [x] -> Just x
 334             []  -> Nothing
 335             _   -> error "Found more than one article body."
 336
 337 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 338 fp_parse_article xml = do
 339   parsed_article_title    <- fp_parse_article_title xml
 340   parsed_article_byline   <- parse_byline xml
 341   parsed_article_body     <- fp_parse_article_body xml
 342
 343   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 344   then
 345     return Nothing
 346   else do
 347     let title'   = Title    $ fromJust parsed_article_title
 348     let byline'  = Byline     parsed_article_byline
 349     let body'    = BodyHtml $ fromJust parsed_article_body
 350     return $ Just $ Article title' byline' body'
 351
 352 parse_html_article :: String -> IO (Maybe Article)
 353 parse_html_article html = do
 354   let xml = parseHtml $ wrap_in_body_div html
 355   fp_parse_article xml
 356
 357
 358 -- | In the full page, all of the article titles and bodies are
 359 --   wrapped in one big div.ArticleText.
 360 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 361 parse_bodies xml =
 362   xml >>> css "div.ArticleText"
 363
 364
 365 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 366 fp_parse_articles xml = do
 367   bodies <- runX . xshow $ parse_bodies xml
 368   let article_separator = "<h2 class=\"SummaryHL\">"
 369   let split_articles'' = split article_separator (concat bodies)
 370   -- The first element will contain the crap before the first <h2...>.
 371   let split_articles' = case split_articles'' of
 372                           (_:_) -> tail split_articles''
 373                           []    -> []
 374   -- Put the separator back, it was lost during the split.
 375   let split_articles = map (article_separator ++) split_articles'
 376   real_articles <- mapM parse_html_article split_articles
 377   let just_articles = catMaybes real_articles
 378   return just_articles
 379
 380
 381 -- | This makes it easy to select otherwise-random chunks of html
 382 --   using 'css'.
 383 wrap_in_body_div :: String -> String
 384 wrap_in_body_div s =
 385   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 386
 387
 388
 389
 390 --
 391 -- Epublishable stuff
 392 --
 393
 394 title :: Page -> String
 395 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 396 title (FullPage hl _) = hl
 397
 398
 399 metadata :: Page -> IO String
 400 metadata obj = do
 401   date <- getCurrentTime
 402   return $
 403     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 404     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 405     "<dc:language>en-US</dc:language>\n" ++
 406     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 407     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 408
 409
 410 epublish :: Page -> Handle -> IO ()
 411 epublish obj handle = do
 412   let xhtml = to_xhtml obj
 413   epmd <- metadata obj
 414   epub <- xhtml_to_epub epmd xhtml
 415   B.hPut handle epub
 416   hFlush handle
 417   hClose handle
 418
 419 xhtml_to_epub :: String -> String -> IO B.ByteString
 420 xhtml_to_epub epmd xhtml = do
 421   stylesheet <- construct_stylesheet
 422   writeEPUB
 423     (Just stylesheet)
 424     []
 425     my_writer_options
 426     (read_html xhtml)
 427   where
 428     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 429     read_html  = readHtml defaultParserState
 430
 431
 432 --
 433 -- Tests
 434 --
 435
 436 test_preprocess_links :: Assertion
 437 test_preprocess_links = do
 438   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 439   let actual_xml = actual_xml' !! 0
 440
 441   expected_xml' <- runX $ expected_xml'' >>> css "body"
 442   let expected_xml = expected_xml' !! 0
 443
 444   assertEqual
 445     "Links are replaced with spans"
 446     expected_xml
 447     actual_xml
 448   where
 449     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 450     input_xml  = parseHtml input_html
 451     expected_html = "<body><span>Hello, world!</span></body>"
 452     expected_xml'' = parseHtml expected_html
 453
 454
 455 test_absolve_images :: Assertion
 456 test_absolve_images = do
 457   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 458   let actual_xml = actual_xml' !! 0
 459
 460   expected_xml' <- runX $ expected_xml'' >>> css "body"
 461   let expected_xml = expected_xml' !! 0
 462
 463   assertEqual
 464     "Image srcs are made absolute"
 465     expected_xml
 466     actual_xml
 467   where
 468     input_html =
 469       "<body>" ++
 470       "<img src=\"/images/2012/example.jpg\" />" ++
 471       "</body>"
 472     input_xml  = parseHtml input_html
 473     expected_html =
 474       "<body>" ++
 475       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 476       "</body>"
 477     expected_xml'' = parseHtml expected_html
 478
 479
 480 test_comments_removed :: Assertion
 481 test_comments_removed = do
 482   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 483   let actual_xml = actual_xml' !! 0
 484
 485   expected_xml' <- runX $ expected_xml'' >>> css "body"
 486   let expected_xml = expected_xml' !! 0
 487
 488   assertEqual
 489     "Comment links are removed"
 490     expected_xml
 491     actual_xml
 492   where
 493     input_html =
 494       "<body><p>" ++
 495       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 496       "</p></body>"
 497     input_xml  = parseHtml input_html
 498
 499     expected_html  = "<body><p></p></body>"
 500     expected_xml'' = parseHtml expected_html
 501
 502
 503 test_full_story_urls_parsed :: Assertion
 504 test_full_story_urls_parsed = do
 505   actual <- runX $ actual'
 506
 507   assertEqual
 508     "Full Story URLs are parsed"
 509     expected
 510     actual
 511   where
 512     expected = ["/Articles/500738/", "/Articles/501837/"]
 513
 514     full_story_html =
 515       concat ["<p>",
 516               "<a href=\"/Articles/500738/\">Full Story</a> ",
 517               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 518               "<p>",
 519               "<a href=\"/Articles/501837/\">Full Story</a> ",
 520               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 521               "<p>"]
 522
 523     xml = parseHtml full_story_html
 524     actual' = xml >>> full_story_urls
 525
 526 page_tests :: Test
 527 page_tests =
 528   testGroup "Page Tests" [
 529     testCase "Links are replaced with spans" test_preprocess_links,
 530     testCase "Image srcs are made absolute" test_absolve_images,
 531     testCase "Comment links are removed" test_comments_removed,
 532     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]