src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   XmlTree,
  28   ($<),
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasName,
  37   processAttrl,
  38   processTopDown,
  39   this,
  40   runX,
  41   xshow,
  42   when)
  43 import Text.HandsomeSoup (css, parseHtml)
  44
  45 import Configuration (Cfg)
  46 import LWN.Article
  47 import LWN.HTTP (
  48   ImageMap,
  49   download_image_urls,
  50   get_article_contents)
  51 import LWN.URI (URL)
  52 import LWN.XHTML (
  53   XHTML,
  54   full_story_urls,
  55   image_srcs,
  56   full_story_link,
  57   full_story_paragraph,
  58   is_image,
  59   preprocess,
  60   remove_byline,
  61   remove_title,
  62   to_xhtml,
  63   to_xml,
  64   xml_from_contents)
  65
  66
  67
  68
  69 data Page =
  70   -- | An LWN page with one article on it.
  71   ArticlePage { article :: Article } |
  72
  73   -- | An LWN page with more than one article on it. These require
  74   --   different parsing and display functions than the single-article
  75   --   pages.
  76   FullPage { headline :: String,
  77              articles :: [Article] }
  78
  79
  80 instance XHTML Page where
  81   to_xhtml (ArticlePage a) =
  82     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  83     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  84     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  85     "<head>" ++
  86     "  <meta http-equiv=\"Content-Type\"" ++
  87     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  88     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  89     "</head>" ++
  90     "<body>" ++
  91     (to_xhtml a) ++
  92     "</body>" ++
  93     "</html>"
  94
  95   to_xhtml (FullPage hl as) =
  96     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  97     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  98     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  99     "<head>" ++
 100     "  <meta http-equiv=\"Content-Type\"" ++
 101     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 102     "  <title>" ++ hl ++ "</title>" ++
 103     "</head>" ++
 104     "<body>" ++
 105     "<div>" ++
 106     "<h1>" ++ hl ++ "</h1>" ++
 107     (concatMap to_xhtml as) ++
 108     "</div>" ++
 109     "</body>" ++
 110     "</html>"
 111
 112
 113
 114 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 115 page_from_url cfg url = do
 116   contents <- get_article_contents cfg url
 117   case (xml_from_contents contents) of
 118     Just html -> parse cfg html
 119     Nothing -> return Nothing
 120
 121
 122
 123 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 124 insert_full_stories story_map =
 125   processTopDown (article_xml `when` full_story_paragraph)
 126   where
 127     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 128     lookup_func href =
 129       case Map.lookup href story_map of
 130         -- Leave it alone if we don't have the full story.
 131         Nothing -> this
 132         Just v -> to_xml v
 133
 134     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 135     article_xml =
 136       lookup_func
 137       $<
 138       (this /> full_story_link >>> getAttrValue "href")
 139
 140 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 141 replace_remote_img_srcs image_map =
 142   processTopDown (make_srcs_local `when` is_image)
 143   where
 144     -- old_src -> new_src
 145     change_src_func :: String -> String
 146     change_src_func old_src =
 147       case Map.lookup old_src image_map of
 148         -- Leave it alone if we don't have the file locally
 149         Nothing -> old_src
 150         Just v -> v
 151
 152     change_src :: (ArrowXml a) => a XmlTree XmlTree
 153     change_src =
 154       changeAttrValue change_src_func
 155
 156     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 157     make_srcs_local =
 158       processAttrl $ (change_src `when` (hasName "src"))
 159
 160
 161
 162
 163 -- Should be called *after* preprocessing.
 164 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 165 download_images xml = do
 166   image_urls <- runX $ xml >>> image_srcs
 167   download_image_urls image_urls
 168
 169
 170
 171 type StoryMap = Map.Map URL Article
 172
 173 -- These come *before* preprocessing.
 174 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 175 download_full_story_urls cfg story_urls = do
 176   pages <- parallel $ map (page_from_url cfg) story_urls
 177   let pairs = zip story_urls pages
 178   return $ foldl my_insert empty_map pairs
 179   where
 180     empty_map = Map.empty :: StoryMap
 181
 182     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 183     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 184     my_insert dict (_, _)  = dict
 185
 186
 187 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 188 download_full_stories cfg xml = do
 189   story_urls <- runX $ xml >>> full_story_urls
 190   download_full_story_urls cfg story_urls
 191
 192
 193 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 194 parse cfg xml = do
 195   story_map <- download_full_stories cfg xml
 196   let fs_xml = xml >>> insert_full_stories story_map
 197
 198   let clean_xml = fs_xml >>> preprocess
 199   image_map <- download_images clean_xml
 200   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 201
 202   appr <- ap_parse local_xml
 203   fppr <- fp_parse local_xml
 204   return $
 205     if (isNothing appr) then
 206       fppr
 207     else
 208       appr
 209
 210
 211
 212 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 213 parse_headline xml = do
 214   let element_filter = xml >>> css "div.PageHeadline h1"
 215   let element_text_filter = element_filter /> getText
 216   element_text <- runX element_text_filter
 217   return $
 218     case element_text of
 219       [x] -> Just $ strip x
 220       []  -> Nothing
 221       _   -> error "Found more than one headline."
 222
 223
 224 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 225 parse_byline xml = do
 226   let element_filter = xml >>> css "div.FeatureByLine"
 227   let element_text_filter = element_filter /> getText
 228   element_text <- runX element_text_filter
 229   return $
 230     case element_text of
 231       [x] -> Just $ strip x
 232       []  -> Nothing
 233       _   -> error "Found more than one article byline."
 234
 235
 236 --
 237 -- ArticlePage Stuff
 238 --
 239 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 240 ap_parse xml = do
 241     arts <- ap_parse_articles xml
 242     case arts of
 243       [x] -> return $ Just $ ArticlePage x
 244       _   -> return Nothing
 245
 246
 247 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 248 ap_parse_body xml = do
 249   let element_filter = xml >>> css "div.ArticleText"
 250   let element_html_filter = xshow element_filter
 251   element_html <- runX element_html_filter
 252   return $ case element_html of
 253             [x] -> Just x
 254             []  -> Nothing
 255             _   -> error "Found more than one article."
 256
 257
 258 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 259 ap_parse_articles xml = do
 260   parsed_headline <- parse_headline xml
 261   parsed_byline   <- parse_byline xml
 262   parsed_body     <- ap_parse_body xml
 263
 264   if (isNothing parsed_headline) || (isNothing parsed_body)
 265   then return []
 266   else do
 267     let title'  = Title    $ fromJust parsed_headline
 268     let byline' = Byline     parsed_byline
 269     let body'   = BodyHtml $ fromJust parsed_body
 270
 271     return $ [Article title' byline' body']
 272
 273
 274
 275 --
 276 -- FullPage Stuff
 277 --
 278
 279 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 280 fp_parse xml = do
 281     hl <- parse_headline xml
 282     parsed_articles <- fp_parse_articles xml
 283     case parsed_articles of
 284       []          -> return Nothing
 285       x -> return $ Just $ FullPage (fromJust hl) x
 286
 287
 288
 289 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 290 fp_parse_article_title xml = do
 291   let element_filter = xml >>> css "h2.SummaryHL"
 292   let element_text_filter = element_filter //> getText
 293   element_text <- runX element_text_filter
 294   return $ case element_text of
 295             [x] -> Just $ strip x
 296             []  -> Nothing
 297             _   -> error "Found more than one article title."
 298
 299
 300
 301
 302 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 303 fp_parse_article_body xml = do
 304   -- First, delete the article title and byline.
 305   let clean_xml' = xml >>> remove_title >>> remove_byline
 306   -- The only child of the body element should be a div.lwn-article
 307   -- since we wrapped the article's HTML in that.
 308   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 309   clean_html <- runX . xshow $ clean_xml
 310   return $ case clean_html of
 311             [x] -> Just x
 312             []  -> Nothing
 313             _   -> error "Found more than one article body."
 314
 315 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 316 fp_parse_article xml = do
 317   parsed_article_title    <- fp_parse_article_title xml
 318   parsed_article_byline   <- parse_byline xml
 319   parsed_article_body     <- fp_parse_article_body xml
 320
 321   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 322   then
 323     return Nothing
 324   else do
 325     let title'   = Title    $ fromJust parsed_article_title
 326     let byline'  = Byline     parsed_article_byline
 327     let body'    = BodyHtml $ fromJust parsed_article_body
 328     return $ Just $ Article title' byline' body'
 329
 330 parse_html_article :: String -> IO (Maybe Article)
 331 parse_html_article html = do
 332   let xml = parseHtml $ wrap_in_body_div html
 333   fp_parse_article xml
 334
 335
 336 -- | In the full page, all of the article titles and bodies are
 337 --   wrapped in one big div.ArticleText.
 338 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 339 parse_bodies xml =
 340   xml >>> css "div.ArticleText"
 341
 342
 343 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 344 fp_parse_articles xml = do
 345   bodies <- runX . xshow $ parse_bodies xml
 346   let article_separator = "<h2 class=\"SummaryHL\">"
 347   let split_articles'' = split article_separator (concat bodies)
 348   -- The first element will contain the crap before the first <h2...>.
 349   let split_articles' = case split_articles'' of
 350                           (_:_) -> tail split_articles''
 351                           []    -> []
 352   -- Put the separator back, it was lost during the split.
 353   let split_articles = map (article_separator ++) split_articles'
 354   real_articles <- mapM parse_html_article split_articles
 355   let just_articles = catMaybes real_articles
 356   return just_articles
 357
 358
 359 -- | This makes it easy to select otherwise-random chunks of html
 360 --   using 'css'.
 361 wrap_in_body_div :: String -> String
 362 wrap_in_body_div s =
 363   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 364
 365
 366
 367
 368 --
 369 -- Epublishable stuff
 370 --
 371
 372 title :: Page -> String
 373 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 374 title (FullPage hl _) = hl
 375
 376
 377 metadata :: Page -> IO String
 378 metadata obj = do
 379   date <- getCurrentTime
 380   return $
 381     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 382     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 383     "<dc:language>en-US</dc:language>\n" ++
 384     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 385     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 386
 387
 388 epublish :: Page -> Handle -> IO ()
 389 epublish obj handle = do
 390   let xhtml = to_xhtml obj
 391   epmd <- metadata obj
 392   epub <- xhtml_to_epub epmd xhtml
 393   B.hPut handle epub
 394   hFlush handle
 395   hClose handle
 396
 397 xhtml_to_epub :: String -> String -> IO B.ByteString
 398 xhtml_to_epub epmd =
 399    write_epub . read_html
 400    where
 401      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 402      write_epub = writeEPUB Nothing [] my_writer_options
 403      read_html  = readHtml defaultParserState
 404
 405
 406 --
 407 -- Tests
 408 --
 409
 410 test_preprocess_links :: Assertion
 411 test_preprocess_links = do
 412   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 413   let actual_xml = actual_xml' !! 0
 414
 415   expected_xml' <- runX $ expected_xml'' >>> css "body"
 416   let expected_xml = expected_xml' !! 0
 417
 418   assertEqual
 419     "Links are replaced with spans"
 420     expected_xml
 421     actual_xml
 422   where
 423     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 424     input_xml  = parseHtml input_html
 425     expected_html = "<body><span>Hello, world!</span></body>"
 426     expected_xml'' = parseHtml expected_html
 427
 428
 429 test_absolve_images :: Assertion
 430 test_absolve_images = do
 431   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 432   let actual_xml = actual_xml' !! 0
 433
 434   expected_xml' <- runX $ expected_xml'' >>> css "body"
 435   let expected_xml = expected_xml' !! 0
 436
 437   assertEqual
 438     "Image srcs are made absolute"
 439     expected_xml
 440     actual_xml
 441   where
 442     input_html =
 443       "<body>" ++
 444       "<img src=\"/images/2012/example.jpg\" />" ++
 445       "</body>"
 446     input_xml  = parseHtml input_html
 447     expected_html =
 448       "<body>" ++
 449       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 450       "</body>"
 451     expected_xml'' = parseHtml expected_html
 452
 453
 454 test_comments_removed :: Assertion
 455 test_comments_removed = do
 456   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 457   let actual_xml = actual_xml' !! 0
 458
 459   expected_xml' <- runX $ expected_xml'' >>> css "body"
 460   let expected_xml = expected_xml' !! 0
 461
 462   assertEqual
 463     "Comment links are removed"
 464     expected_xml
 465     actual_xml
 466   where
 467     input_html =
 468       "<body><p>" ++
 469       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 470       "</p></body>"
 471     input_xml  = parseHtml input_html
 472
 473     expected_html  = "<body><p></p></body>"
 474     expected_xml'' = parseHtml expected_html
 475
 476
 477 test_full_story_urls_parsed :: Assertion
 478 test_full_story_urls_parsed = do
 479   actual <- runX $ actual'
 480
 481   assertEqual
 482     "Full Story URLs are parsed"
 483     expected
 484     actual
 485   where
 486     expected = ["/Articles/500738/", "/Articles/501837/"]
 487
 488     full_story_html =
 489       concat ["<p>",
 490               "<a href=\"/Articles/500738/\">Full Story</a> ",
 491               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 492               "<p>",
 493               "<a href=\"/Articles/501837/\">Full Story</a> ",
 494               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 495               "<p>"]
 496
 497     xml = parseHtml full_story_html
 498     actual' = xml >>> full_story_urls
 499
 500 page_tests :: Test
 501 page_tests =
 502   testGroup "Page Tests" [
 503     testCase "Links are replaced with spans" test_preprocess_links,
 504     testCase "Image srcs are made absolute" test_absolve_images,
 505     testCase "Comment links are removed" test_comments_removed,
 506     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]