src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   XmlTree,
  28   ($<),
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasName,
  37   processAttrl,
  38   processTopDown,
  39   this,
  40   runX,
  41   xshow,
  42   when)
  43 import Text.HandsomeSoup (css, parseHtml)
  44
  45 import Configuration (Cfg, full_stories)
  46 import LWN.Article
  47 import LWN.HTTP (
  48   ImageMap,
  49   download_image_urls,
  50   get_article_contents)
  51 import LWN.URI (URL)
  52 import LWN.XHTML (
  53   XHTML,
  54   full_story_urls,
  55   image_srcs,
  56   full_story_link,
  57   full_story_paragraph,
  58   is_image,
  59   preprocess,
  60   remove_byline,
  61   remove_full_story_paragraphs,
  62   remove_title,
  63   to_xhtml,
  64   to_xml,
  65   xml_from_contents)
  66
  67
  68
  69
  70 data Page =
  71   -- | An LWN page with one article on it.
  72   ArticlePage { article :: Article } |
  73
  74   -- | An LWN page with more than one article on it. These require
  75   --   different parsing and display functions than the single-article
  76   --   pages.
  77   FullPage { headline :: String,
  78              articles :: [Article] }
  79
  80
  81 instance XHTML Page where
  82   to_xhtml (ArticlePage a) =
  83     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  84     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  85     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  86     "<head>" ++
  87     "  <meta http-equiv=\"Content-Type\"" ++
  88     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  89     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  90     "</head>" ++
  91     "<body>" ++
  92     (to_xhtml a) ++
  93     "</body>" ++
  94     "</html>"
  95
  96   to_xhtml (FullPage hl as) =
  97     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  98     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  99     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 100     "<head>" ++
 101     "  <meta http-equiv=\"Content-Type\"" ++
 102     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 103     "  <title>" ++ hl ++ "</title>" ++
 104     "</head>" ++
 105     "<body>" ++
 106     "<div>" ++
 107     "<h1>" ++ hl ++ "</h1>" ++
 108     (concatMap to_xhtml as) ++
 109     "</div>" ++
 110     "</body>" ++
 111     "</html>"
 112
 113
 114
 115 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 116 page_from_url cfg url = do
 117   contents <- get_article_contents cfg url
 118   case (xml_from_contents contents) of
 119     Just html -> parse cfg html
 120     Nothing -> return Nothing
 121
 122
 123
 124 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 125 insert_full_stories story_map =
 126   processTopDown (article_xml `when` full_story_paragraph)
 127   where
 128     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 129     lookup_func href =
 130       case Map.lookup href story_map of
 131         -- Leave it alone if we don't have the full story.
 132         Nothing -> this
 133         Just v -> to_xml v
 134
 135     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 136     article_xml =
 137       lookup_func
 138       $< -- From HXT's Control.Arrow.ArrowList
 139       (this /> full_story_link >>> getAttrValue "href")
 140
 141 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 142 replace_remote_img_srcs image_map =
 143   processTopDown (make_srcs_local `when` is_image)
 144   where
 145     -- old_src -> new_src
 146     change_src_func :: String -> String
 147     change_src_func old_src =
 148       case Map.lookup old_src image_map of
 149         -- Leave it alone if we don't have the file locally
 150         Nothing -> old_src
 151         Just v -> v
 152
 153     change_src :: (ArrowXml a) => a XmlTree XmlTree
 154     change_src =
 155       changeAttrValue change_src_func
 156
 157     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 158     make_srcs_local =
 159       processAttrl $ (change_src `when` (hasName "src"))
 160
 161
 162
 163
 164 -- Should be called *after* preprocessing.
 165 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 166 download_images xml = do
 167   image_urls <- runX $ xml >>> image_srcs
 168   download_image_urls image_urls
 169
 170
 171
 172 type StoryMap = Map.Map URL Article
 173
 174 -- These come *before* preprocessing.
 175 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 176 download_full_story_urls cfg story_urls = do
 177   pages <- parallel $ map (page_from_url cfg) story_urls
 178   let pairs = zip story_urls pages
 179   return $ foldl my_insert empty_map pairs
 180   where
 181     empty_map = Map.empty :: StoryMap
 182
 183     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 184     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 185     my_insert dict (_, _)  = dict
 186
 187
 188 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 189 download_full_stories cfg xml = do
 190   story_urls <- runX $ xml >>> full_story_urls
 191   download_full_story_urls cfg story_urls
 192
 193
 194 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 195 parse cfg xml = do
 196   fs_xml <- if (full_stories cfg) then do
 197              story_map <- download_full_stories cfg xml
 198              return $ xml >>> insert_full_stories story_map
 199            else do
 200              -- Get rid of them if we don't want them.
 201              return $ xml >>> remove_full_story_paragraphs
 202
 203   let clean_xml = fs_xml >>> preprocess
 204   image_map <- download_images clean_xml
 205   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 206
 207   appr <- ap_parse local_xml
 208   fppr <- fp_parse local_xml
 209   return $
 210     if (isNothing appr) then
 211       fppr
 212     else
 213       appr
 214
 215
 216
 217 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 218 parse_headline xml = do
 219   let element_filter = xml >>> css "div.PageHeadline h1"
 220   let element_text_filter = element_filter /> getText
 221   element_text <- runX element_text_filter
 222   return $
 223     case element_text of
 224       [x] -> Just $ strip x
 225       []  -> Nothing
 226       _   -> error "Found more than one headline."
 227
 228
 229 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 230 parse_byline xml = do
 231   let element_filter = xml >>> css "div.FeatureByLine"
 232   let element_text_filter = element_filter /> getText
 233   element_text <- runX element_text_filter
 234   return $
 235     case element_text of
 236       [x] -> Just $ strip x
 237       []  -> Nothing
 238       _   -> error "Found more than one article byline."
 239
 240
 241 --
 242 -- ArticlePage Stuff
 243 --
 244 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 245 ap_parse xml = do
 246     arts <- ap_parse_articles xml
 247     case arts of
 248       [x] -> return $ Just $ ArticlePage x
 249       _   -> return Nothing
 250
 251
 252 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 253 ap_parse_body xml = do
 254   let element_filter = xml >>> css "div.ArticleText"
 255   let element_html_filter = xshow element_filter
 256   element_html <- runX element_html_filter
 257   return $ case element_html of
 258             [x] -> Just x
 259             []  -> Nothing
 260             _   -> error "Found more than one article."
 261
 262
 263 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 264 ap_parse_articles xml = do
 265   parsed_headline <- parse_headline xml
 266   parsed_byline   <- parse_byline xml
 267   parsed_body     <- ap_parse_body xml
 268
 269   if (isNothing parsed_headline) || (isNothing parsed_body)
 270   then return []
 271   else do
 272     let title'  = Title    $ fromJust parsed_headline
 273     let byline' = Byline     parsed_byline
 274     let body'   = BodyHtml $ fromJust parsed_body
 275
 276     return $ [Article title' byline' body']
 277
 278
 279
 280 --
 281 -- FullPage Stuff
 282 --
 283
 284 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 285 fp_parse xml = do
 286     hl <- parse_headline xml
 287     parsed_articles <- fp_parse_articles xml
 288     case parsed_articles of
 289       []          -> return Nothing
 290       x -> return $ Just $ FullPage (fromJust hl) x
 291
 292
 293
 294 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 295 fp_parse_article_title xml = do
 296   let element_filter = xml >>> css "h2.SummaryHL"
 297   let element_text_filter = element_filter //> getText
 298   element_text <- runX element_text_filter
 299   return $ case element_text of
 300             [x] -> Just $ strip x
 301             []  -> Nothing
 302             _   -> error "Found more than one article title."
 303
 304
 305
 306
 307 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 308 fp_parse_article_body xml = do
 309   -- First, delete the article title and byline.
 310   let clean_xml' = xml >>> remove_title >>> remove_byline
 311   -- The only child of the body element should be a div.lwn-article
 312   -- since we wrapped the article's HTML in that.
 313   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 314   clean_html <- runX . xshow $ clean_xml
 315   return $ case clean_html of
 316             [x] -> Just x
 317             []  -> Nothing
 318             _   -> error "Found more than one article body."
 319
 320 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 321 fp_parse_article xml = do
 322   parsed_article_title    <- fp_parse_article_title xml
 323   parsed_article_byline   <- parse_byline xml
 324   parsed_article_body     <- fp_parse_article_body xml
 325
 326   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 327   then
 328     return Nothing
 329   else do
 330     let title'   = Title    $ fromJust parsed_article_title
 331     let byline'  = Byline     parsed_article_byline
 332     let body'    = BodyHtml $ fromJust parsed_article_body
 333     return $ Just $ Article title' byline' body'
 334
 335 parse_html_article :: String -> IO (Maybe Article)
 336 parse_html_article html = do
 337   let xml = parseHtml $ wrap_in_body_div html
 338   fp_parse_article xml
 339
 340
 341 -- | In the full page, all of the article titles and bodies are
 342 --   wrapped in one big div.ArticleText.
 343 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 344 parse_bodies xml =
 345   xml >>> css "div.ArticleText"
 346
 347
 348 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 349 fp_parse_articles xml = do
 350   bodies <- runX . xshow $ parse_bodies xml
 351   let article_separator = "<h2 class=\"SummaryHL\">"
 352   let split_articles'' = split article_separator (concat bodies)
 353   -- The first element will contain the crap before the first <h2...>.
 354   let split_articles' = case split_articles'' of
 355                           (_:_) -> tail split_articles''
 356                           []    -> []
 357   -- Put the separator back, it was lost during the split.
 358   let split_articles = map (article_separator ++) split_articles'
 359   real_articles <- mapM parse_html_article split_articles
 360   let just_articles = catMaybes real_articles
 361   return just_articles
 362
 363
 364 -- | This makes it easy to select otherwise-random chunks of html
 365 --   using 'css'.
 366 wrap_in_body_div :: String -> String
 367 wrap_in_body_div s =
 368   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 369
 370
 371
 372
 373 --
 374 -- Epublishable stuff
 375 --
 376
 377 title :: Page -> String
 378 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 379 title (FullPage hl _) = hl
 380
 381
 382 metadata :: Page -> IO String
 383 metadata obj = do
 384   date <- getCurrentTime
 385   return $
 386     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 387     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 388     "<dc:language>en-US</dc:language>\n" ++
 389     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 390     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 391
 392
 393 epublish :: Page -> Handle -> IO ()
 394 epublish obj handle = do
 395   let xhtml = to_xhtml obj
 396   epmd <- metadata obj
 397   epub <- xhtml_to_epub epmd xhtml
 398   B.hPut handle epub
 399   hFlush handle
 400   hClose handle
 401
 402 xhtml_to_epub :: String -> String -> IO B.ByteString
 403 xhtml_to_epub epmd =
 404    write_epub . read_html
 405    where
 406      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 407      write_epub = writeEPUB Nothing [] my_writer_options
 408      read_html  = readHtml defaultParserState
 409
 410
 411 --
 412 -- Tests
 413 --
 414
 415 test_preprocess_links :: Assertion
 416 test_preprocess_links = do
 417   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 418   let actual_xml = actual_xml' !! 0
 419
 420   expected_xml' <- runX $ expected_xml'' >>> css "body"
 421   let expected_xml = expected_xml' !! 0
 422
 423   assertEqual
 424     "Links are replaced with spans"
 425     expected_xml
 426     actual_xml
 427   where
 428     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 429     input_xml  = parseHtml input_html
 430     expected_html = "<body><span>Hello, world!</span></body>"
 431     expected_xml'' = parseHtml expected_html
 432
 433
 434 test_absolve_images :: Assertion
 435 test_absolve_images = do
 436   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 437   let actual_xml = actual_xml' !! 0
 438
 439   expected_xml' <- runX $ expected_xml'' >>> css "body"
 440   let expected_xml = expected_xml' !! 0
 441
 442   assertEqual
 443     "Image srcs are made absolute"
 444     expected_xml
 445     actual_xml
 446   where
 447     input_html =
 448       "<body>" ++
 449       "<img src=\"/images/2012/example.jpg\" />" ++
 450       "</body>"
 451     input_xml  = parseHtml input_html
 452     expected_html =
 453       "<body>" ++
 454       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 455       "</body>"
 456     expected_xml'' = parseHtml expected_html
 457
 458
 459 test_comments_removed :: Assertion
 460 test_comments_removed = do
 461   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 462   let actual_xml = actual_xml' !! 0
 463
 464   expected_xml' <- runX $ expected_xml'' >>> css "body"
 465   let expected_xml = expected_xml' !! 0
 466
 467   assertEqual
 468     "Comment links are removed"
 469     expected_xml
 470     actual_xml
 471   where
 472     input_html =
 473       "<body><p>" ++
 474       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 475       "</p></body>"
 476     input_xml  = parseHtml input_html
 477
 478     expected_html  = "<body><p></p></body>"
 479     expected_xml'' = parseHtml expected_html
 480
 481
 482 test_full_story_urls_parsed :: Assertion
 483 test_full_story_urls_parsed = do
 484   actual <- runX $ actual'
 485
 486   assertEqual
 487     "Full Story URLs are parsed"
 488     expected
 489     actual
 490   where
 491     expected = ["/Articles/500738/", "/Articles/501837/"]
 492
 493     full_story_html =
 494       concat ["<p>",
 495               "<a href=\"/Articles/500738/\">Full Story</a> ",
 496               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 497               "<p>",
 498               "<a href=\"/Articles/501837/\">Full Story</a> ",
 499               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 500               "<p>"]
 501
 502     xml = parseHtml full_story_html
 503     actual' = xml >>> full_story_urls
 504
 505 page_tests :: Test
 506 page_tests =
 507   testGroup "Page Tests" [
 508     testCase "Links are replaced with spans" test_preprocess_links,
 509     testCase "Image srcs are made absolute" test_absolve_images,
 510     testCase "Comment links are removed" test_comments_removed,
 511     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]