src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   XmlTree,
  28   ($<),
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasName,
  37   none,
  38   processAttrl,
  39   processTopDown,
  40   this,
  41   runX,
  42   xshow,
  43   when)
  44 import Text.HandsomeSoup (css, parseHtml)
  45
  46 import Configuration (Cfg, full_stories)
  47 import LWN.Article
  48 import LWN.HTTP (
  49   ImageMap,
  50   download_image_urls,
  51   get_article_contents)
  52 import LWN.URI (URL)
  53 import LWN.XHTML (
  54   XHTML,
  55   full_story_urls,
  56   image_srcs,
  57   full_story_link,
  58   full_story_paragraph,
  59   is_image,
  60   preprocess,
  61   remove_byline,
  62   remove_full_story_paragraphs,
  63   remove_title,
  64   to_xhtml,
  65   to_xml,
  66   xml_from_contents)
  67
  68
  69
  70
  71 data Page =
  72   -- | An LWN page with one article on it.
  73   ArticlePage { article :: Article } |
  74
  75   -- | An LWN page with more than one article on it. These require
  76   --   different parsing and display functions than the single-article
  77   --   pages.
  78   FullPage { headline :: String,
  79              articles :: [Article] }
  80
  81
  82 instance XHTML Page where
  83   to_xhtml (ArticlePage a) =
  84     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  85     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  86     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  87     "<head>" ++
  88     "  <meta http-equiv=\"Content-Type\"" ++
  89     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  90     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  91     "</head>" ++
  92     "<body>" ++
  93     (to_xhtml a) ++
  94     "</body>" ++
  95     "</html>"
  96
  97   to_xhtml (FullPage hl as) =
  98     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  99     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
 100     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
 101     "<head>" ++
 102     "  <meta http-equiv=\"Content-Type\"" ++
 103     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 104     "  <title>" ++ hl ++ "</title>" ++
 105     "</head>" ++
 106     "<body>" ++
 107     "<div>" ++
 108     "<h1>" ++ hl ++ "</h1>" ++
 109     (concatMap to_xhtml as) ++
 110     "</div>" ++
 111     "</body>" ++
 112     "</html>"
 113
 114
 115
 116 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 117 page_from_url cfg url = do
 118   contents <- get_article_contents cfg url
 119   case (xml_from_contents contents) of
 120     Just html -> parse cfg html
 121     Nothing -> return Nothing
 122
 123
 124
 125 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 126 insert_full_stories story_map =
 127   processTopDown (article_xml `when` full_story_paragraph)
 128   where
 129     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 130     lookup_func href =
 131       case Map.lookup href story_map of
 132         -- Drop the paragraph if we don't have the contents.
 133         Nothing -> none
 134         Just v -> to_xml v
 135
 136     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 137     article_xml =
 138       lookup_func
 139       $< -- From HXT's Control.Arrow.ArrowList
 140       (this /> full_story_link >>> getAttrValue "href")
 141
 142 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 143 replace_remote_img_srcs image_map =
 144   processTopDown (make_srcs_local `when` is_image)
 145   where
 146     -- old_src -> new_src
 147     change_src_func :: String -> String
 148     change_src_func old_src =
 149       case Map.lookup old_src image_map of
 150         -- Leave it alone if we don't have the file locally
 151         Nothing -> old_src
 152         Just v -> v
 153
 154     change_src :: (ArrowXml a) => a XmlTree XmlTree
 155     change_src =
 156       changeAttrValue change_src_func
 157
 158     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 159     make_srcs_local =
 160       processAttrl $ (change_src `when` (hasName "src"))
 161
 162
 163
 164
 165 -- Should be called *after* preprocessing.
 166 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 167 download_images xml = do
 168   image_urls <- runX $ xml >>> image_srcs
 169   download_image_urls image_urls
 170
 171
 172
 173 type StoryMap = Map.Map URL Article
 174
 175 -- These come *before* preprocessing.
 176 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 177 download_full_story_urls cfg story_urls = do
 178   pages <- parallel $ map (page_from_url cfg) story_urls
 179   let pairs = zip story_urls pages
 180   return $ foldl my_insert empty_map pairs
 181   where
 182     empty_map = Map.empty :: StoryMap
 183
 184     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 185     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 186     my_insert dict (_, _)  = dict
 187
 188
 189 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 190 download_full_stories cfg xml = do
 191   story_urls <- runX $ xml >>> full_story_urls
 192   download_full_story_urls cfg story_urls
 193
 194
 195 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 196 parse cfg xml = do
 197   fs_xml <- if (full_stories cfg) then do
 198              story_map <- download_full_stories cfg xml
 199              return $ xml >>> insert_full_stories story_map
 200            else do
 201              -- Get rid of them if we don't want them.
 202              return $ xml >>> remove_full_story_paragraphs
 203
 204   let clean_xml = fs_xml >>> preprocess
 205   image_map <- download_images clean_xml
 206   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 207
 208   appr <- ap_parse local_xml
 209   fppr <- fp_parse local_xml
 210   return $
 211     if (isNothing appr) then
 212       fppr
 213     else
 214       appr
 215
 216
 217
 218 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 219 parse_headline xml = do
 220   let element_filter = xml >>> css "div.PageHeadline h1"
 221   let element_text_filter = element_filter /> getText
 222   element_text <- runX element_text_filter
 223   return $
 224     case element_text of
 225       [x] -> Just $ strip x
 226       []  -> Nothing
 227       _   -> error "Found more than one headline."
 228
 229
 230 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 231 parse_byline xml = do
 232   let element_filter = xml >>> css "div.FeatureByLine"
 233   let element_text_filter = element_filter /> getText
 234   element_text <- runX element_text_filter
 235   return $
 236     case element_text of
 237       [x] -> Just $ strip x
 238       []  -> Nothing
 239       _   -> error "Found more than one article byline."
 240
 241
 242 --
 243 -- ArticlePage Stuff
 244 --
 245 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 246 ap_parse xml = do
 247     arts <- ap_parse_articles xml
 248     case arts of
 249       [x] -> return $ Just $ ArticlePage x
 250       _   -> return Nothing
 251
 252
 253 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 254 ap_parse_body xml = do
 255   let element_filter = xml >>> css "div.ArticleText"
 256   let element_html_filter = xshow element_filter
 257   element_html <- runX element_html_filter
 258   return $ case element_html of
 259             [x] -> Just x
 260             []  -> Nothing
 261             _   -> error "Found more than one article."
 262
 263
 264 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 265 ap_parse_articles xml = do
 266   parsed_headline <- parse_headline xml
 267   parsed_byline   <- parse_byline xml
 268   parsed_body     <- ap_parse_body xml
 269
 270   if (isNothing parsed_headline) || (isNothing parsed_body)
 271   then return []
 272   else do
 273     let title'  = Title    $ fromJust parsed_headline
 274     let byline' = Byline     parsed_byline
 275     let body'   = BodyHtml $ fromJust parsed_body
 276
 277     return $ [Article title' byline' body']
 278
 279
 280
 281 --
 282 -- FullPage Stuff
 283 --
 284
 285 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 286 fp_parse xml = do
 287     hl <- parse_headline xml
 288     parsed_articles <- fp_parse_articles xml
 289     case parsed_articles of
 290       []          -> return Nothing
 291       x -> return $ Just $ FullPage (fromJust hl) x
 292
 293
 294
 295 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 296 fp_parse_article_title xml = do
 297   let element_filter = xml >>> css "h2.SummaryHL"
 298   let element_text_filter = element_filter //> getText
 299   element_text <- runX element_text_filter
 300   return $ case element_text of
 301             [x] -> Just $ strip x
 302             []  -> Nothing
 303             _   -> error "Found more than one article title."
 304
 305
 306
 307
 308 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 309 fp_parse_article_body xml = do
 310   -- First, delete the article title and byline.
 311   let clean_xml' = xml >>> remove_title >>> remove_byline
 312   -- The only child of the body element should be a div.lwn-article
 313   -- since we wrapped the article's HTML in that.
 314   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 315   clean_html <- runX . xshow $ clean_xml
 316   return $ case clean_html of
 317             [x] -> Just x
 318             []  -> Nothing
 319             _   -> error "Found more than one article body."
 320
 321 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 322 fp_parse_article xml = do
 323   parsed_article_title    <- fp_parse_article_title xml
 324   parsed_article_byline   <- parse_byline xml
 325   parsed_article_body     <- fp_parse_article_body xml
 326
 327   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 328   then
 329     return Nothing
 330   else do
 331     let title'   = Title    $ fromJust parsed_article_title
 332     let byline'  = Byline     parsed_article_byline
 333     let body'    = BodyHtml $ fromJust parsed_article_body
 334     return $ Just $ Article title' byline' body'
 335
 336 parse_html_article :: String -> IO (Maybe Article)
 337 parse_html_article html = do
 338   let xml = parseHtml $ wrap_in_body_div html
 339   fp_parse_article xml
 340
 341
 342 -- | In the full page, all of the article titles and bodies are
 343 --   wrapped in one big div.ArticleText.
 344 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 345 parse_bodies xml =
 346   xml >>> css "div.ArticleText"
 347
 348
 349 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 350 fp_parse_articles xml = do
 351   bodies <- runX . xshow $ parse_bodies xml
 352   let article_separator = "<h2 class=\"SummaryHL\">"
 353   let split_articles'' = split article_separator (concat bodies)
 354   -- The first element will contain the crap before the first <h2...>.
 355   let split_articles' = case split_articles'' of
 356                           (_:_) -> tail split_articles''
 357                           []    -> []
 358   -- Put the separator back, it was lost during the split.
 359   let split_articles = map (article_separator ++) split_articles'
 360   real_articles <- mapM parse_html_article split_articles
 361   let just_articles = catMaybes real_articles
 362   return just_articles
 363
 364
 365 -- | This makes it easy to select otherwise-random chunks of html
 366 --   using 'css'.
 367 wrap_in_body_div :: String -> String
 368 wrap_in_body_div s =
 369   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 370
 371
 372
 373
 374 --
 375 -- Epublishable stuff
 376 --
 377
 378 title :: Page -> String
 379 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 380 title (FullPage hl _) = hl
 381
 382
 383 metadata :: Page -> IO String
 384 metadata obj = do
 385   date <- getCurrentTime
 386   return $
 387     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 388     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 389     "<dc:language>en-US</dc:language>\n" ++
 390     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 391     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 392
 393
 394 epublish :: Page -> Handle -> IO ()
 395 epublish obj handle = do
 396   let xhtml = to_xhtml obj
 397   epmd <- metadata obj
 398   epub <- xhtml_to_epub epmd xhtml
 399   B.hPut handle epub
 400   hFlush handle
 401   hClose handle
 402
 403 xhtml_to_epub :: String -> String -> IO B.ByteString
 404 xhtml_to_epub epmd =
 405    write_epub . read_html
 406    where
 407      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 408      write_epub = writeEPUB Nothing [] my_writer_options
 409      read_html  = readHtml defaultParserState
 410
 411
 412 --
 413 -- Tests
 414 --
 415
 416 test_preprocess_links :: Assertion
 417 test_preprocess_links = do
 418   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 419   let actual_xml = actual_xml' !! 0
 420
 421   expected_xml' <- runX $ expected_xml'' >>> css "body"
 422   let expected_xml = expected_xml' !! 0
 423
 424   assertEqual
 425     "Links are replaced with spans"
 426     expected_xml
 427     actual_xml
 428   where
 429     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 430     input_xml  = parseHtml input_html
 431     expected_html = "<body><span>Hello, world!</span></body>"
 432     expected_xml'' = parseHtml expected_html
 433
 434
 435 test_absolve_images :: Assertion
 436 test_absolve_images = do
 437   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 438   let actual_xml = actual_xml' !! 0
 439
 440   expected_xml' <- runX $ expected_xml'' >>> css "body"
 441   let expected_xml = expected_xml' !! 0
 442
 443   assertEqual
 444     "Image srcs are made absolute"
 445     expected_xml
 446     actual_xml
 447   where
 448     input_html =
 449       "<body>" ++
 450       "<img src=\"/images/2012/example.jpg\" />" ++
 451       "</body>"
 452     input_xml  = parseHtml input_html
 453     expected_html =
 454       "<body>" ++
 455       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 456       "</body>"
 457     expected_xml'' = parseHtml expected_html
 458
 459
 460 test_comments_removed :: Assertion
 461 test_comments_removed = do
 462   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 463   let actual_xml = actual_xml' !! 0
 464
 465   expected_xml' <- runX $ expected_xml'' >>> css "body"
 466   let expected_xml = expected_xml' !! 0
 467
 468   assertEqual
 469     "Comment links are removed"
 470     expected_xml
 471     actual_xml
 472   where
 473     input_html =
 474       "<body><p>" ++
 475       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 476       "</p></body>"
 477     input_xml  = parseHtml input_html
 478
 479     expected_html  = "<body><p></p></body>"
 480     expected_xml'' = parseHtml expected_html
 481
 482
 483 test_full_story_urls_parsed :: Assertion
 484 test_full_story_urls_parsed = do
 485   actual <- runX $ actual'
 486
 487   assertEqual
 488     "Full Story URLs are parsed"
 489     expected
 490     actual
 491   where
 492     expected = ["/Articles/500738/", "/Articles/501837/"]
 493
 494     full_story_html =
 495       concat ["<p>",
 496               "<a href=\"/Articles/500738/\">Full Story</a> ",
 497               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 498               "<p>",
 499               "<a href=\"/Articles/501837/\">Full Story</a> ",
 500               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 501               "<p>"]
 502
 503     xml = parseHtml full_story_html
 504     actual' = xml >>> full_story_urls
 505
 506 page_tests :: Test
 507 page_tests =
 508   testGroup "Page Tests" [
 509     testCase "Links are replaced with spans" test_preprocess_links,
 510     testCase "Image srcs are made absolute" test_absolve_images,
 511     testCase "Comment links are removed" test_comments_removed,
 512     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]