src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   XmlTree,
  28   ($<),
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasName,
  37   processAttrl,
  38   processTopDown,
  39   this,
  40   runX,
  41   xshow,
  42   when)
  43 import Text.HandsomeSoup (css, parseHtml)
  44
  45 import Configuration (Cfg, full_stories)
  46 import LWN.Article
  47 import LWN.HTTP (
  48   ImageMap,
  49   download_image_urls,
  50   get_article_contents)
  51 import LWN.URI (URL)
  52 import LWN.XHTML (
  53   XHTML,
  54   full_story_urls,
  55   image_srcs,
  56   full_story_link,
  57   full_story_paragraph,
  58   is_image,
  59   preprocess,
  60   remove_byline,
  61   remove_title,
  62   to_xhtml,
  63   to_xml,
  64   xml_from_contents)
  65
  66
  67
  68
  69 data Page =
  70   -- | An LWN page with one article on it.
  71   ArticlePage { article :: Article } |
  72
  73   -- | An LWN page with more than one article on it. These require
  74   --   different parsing and display functions than the single-article
  75   --   pages.
  76   FullPage { headline :: String,
  77              articles :: [Article] }
  78
  79
  80 instance XHTML Page where
  81   to_xhtml (ArticlePage a) =
  82     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  83     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  84     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  85     "<head>" ++
  86     "  <meta http-equiv=\"Content-Type\"" ++
  87     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  88     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  89     "</head>" ++
  90     "<body>" ++
  91     (to_xhtml a) ++
  92     "</body>" ++
  93     "</html>"
  94
  95   to_xhtml (FullPage hl as) =
  96     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  97     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  98     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  99     "<head>" ++
 100     "  <meta http-equiv=\"Content-Type\"" ++
 101     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 102     "  <title>" ++ hl ++ "</title>" ++
 103     "</head>" ++
 104     "<body>" ++
 105     "<div>" ++
 106     "<h1>" ++ hl ++ "</h1>" ++
 107     (concatMap to_xhtml as) ++
 108     "</div>" ++
 109     "</body>" ++
 110     "</html>"
 111
 112
 113
 114 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 115 page_from_url cfg url = do
 116   contents <- get_article_contents cfg url
 117   case (xml_from_contents contents) of
 118     Just html -> parse cfg html
 119     Nothing -> return Nothing
 120
 121
 122
 123 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 124 insert_full_stories story_map =
 125   processTopDown (article_xml `when` full_story_paragraph)
 126   where
 127     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 128     lookup_func href =
 129       case Map.lookup href story_map of
 130         -- Leave it alone if we don't have the full story.
 131         Nothing -> this
 132         Just v -> to_xml v
 133
 134     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 135     article_xml =
 136       lookup_func
 137       $<
 138       (this /> full_story_link >>> getAttrValue "href")
 139
 140 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 141 replace_remote_img_srcs image_map =
 142   processTopDown (make_srcs_local `when` is_image)
 143   where
 144     -- old_src -> new_src
 145     change_src_func :: String -> String
 146     change_src_func old_src =
 147       case Map.lookup old_src image_map of
 148         -- Leave it alone if we don't have the file locally
 149         Nothing -> old_src
 150         Just v -> v
 151
 152     change_src :: (ArrowXml a) => a XmlTree XmlTree
 153     change_src =
 154       changeAttrValue change_src_func
 155
 156     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 157     make_srcs_local =
 158       processAttrl $ (change_src `when` (hasName "src"))
 159
 160
 161
 162
 163 -- Should be called *after* preprocessing.
 164 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 165 download_images xml = do
 166   image_urls <- runX $ xml >>> image_srcs
 167   download_image_urls image_urls
 168
 169
 170
 171 type StoryMap = Map.Map URL Article
 172
 173 -- These come *before* preprocessing.
 174 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 175 download_full_story_urls cfg story_urls = do
 176   pages <- parallel $ map (page_from_url cfg) story_urls
 177   let pairs = zip story_urls pages
 178   return $ foldl my_insert empty_map pairs
 179   where
 180     empty_map = Map.empty :: StoryMap
 181
 182     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 183     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 184     my_insert dict (_, _)  = dict
 185
 186
 187 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 188 download_full_stories cfg xml = do
 189   story_urls <- runX $ xml >>> full_story_urls
 190   download_full_story_urls cfg story_urls
 191
 192
 193 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 194 parse cfg xml = do
 195   story_map <- download_full_stories cfg xml
 196   let fs_xml = if (full_stories cfg) then
 197                  xml >>> insert_full_stories story_map
 198                else
 199                  xml
 200
 201   let clean_xml = fs_xml >>> preprocess
 202   image_map <- download_images clean_xml
 203   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 204
 205   appr <- ap_parse local_xml
 206   fppr <- fp_parse local_xml
 207   return $
 208     if (isNothing appr) then
 209       fppr
 210     else
 211       appr
 212
 213
 214
 215 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 216 parse_headline xml = do
 217   let element_filter = xml >>> css "div.PageHeadline h1"
 218   let element_text_filter = element_filter /> getText
 219   element_text <- runX element_text_filter
 220   return $
 221     case element_text of
 222       [x] -> Just $ strip x
 223       []  -> Nothing
 224       _   -> error "Found more than one headline."
 225
 226
 227 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 228 parse_byline xml = do
 229   let element_filter = xml >>> css "div.FeatureByLine"
 230   let element_text_filter = element_filter /> getText
 231   element_text <- runX element_text_filter
 232   return $
 233     case element_text of
 234       [x] -> Just $ strip x
 235       []  -> Nothing
 236       _   -> error "Found more than one article byline."
 237
 238
 239 --
 240 -- ArticlePage Stuff
 241 --
 242 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 243 ap_parse xml = do
 244     arts <- ap_parse_articles xml
 245     case arts of
 246       [x] -> return $ Just $ ArticlePage x
 247       _   -> return Nothing
 248
 249
 250 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 251 ap_parse_body xml = do
 252   let element_filter = xml >>> css "div.ArticleText"
 253   let element_html_filter = xshow element_filter
 254   element_html <- runX element_html_filter
 255   return $ case element_html of
 256             [x] -> Just x
 257             []  -> Nothing
 258             _   -> error "Found more than one article."
 259
 260
 261 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 262 ap_parse_articles xml = do
 263   parsed_headline <- parse_headline xml
 264   parsed_byline   <- parse_byline xml
 265   parsed_body     <- ap_parse_body xml
 266
 267   if (isNothing parsed_headline) || (isNothing parsed_body)
 268   then return []
 269   else do
 270     let title'  = Title    $ fromJust parsed_headline
 271     let byline' = Byline     parsed_byline
 272     let body'   = BodyHtml $ fromJust parsed_body
 273
 274     return $ [Article title' byline' body']
 275
 276
 277
 278 --
 279 -- FullPage Stuff
 280 --
 281
 282 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 283 fp_parse xml = do
 284     hl <- parse_headline xml
 285     parsed_articles <- fp_parse_articles xml
 286     case parsed_articles of
 287       []          -> return Nothing
 288       x -> return $ Just $ FullPage (fromJust hl) x
 289
 290
 291
 292 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 293 fp_parse_article_title xml = do
 294   let element_filter = xml >>> css "h2.SummaryHL"
 295   let element_text_filter = element_filter //> getText
 296   element_text <- runX element_text_filter
 297   return $ case element_text of
 298             [x] -> Just $ strip x
 299             []  -> Nothing
 300             _   -> error "Found more than one article title."
 301
 302
 303
 304
 305 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 306 fp_parse_article_body xml = do
 307   -- First, delete the article title and byline.
 308   let clean_xml' = xml >>> remove_title >>> remove_byline
 309   -- The only child of the body element should be a div.lwn-article
 310   -- since we wrapped the article's HTML in that.
 311   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 312   clean_html <- runX . xshow $ clean_xml
 313   return $ case clean_html of
 314             [x] -> Just x
 315             []  -> Nothing
 316             _   -> error "Found more than one article body."
 317
 318 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 319 fp_parse_article xml = do
 320   parsed_article_title    <- fp_parse_article_title xml
 321   parsed_article_byline   <- parse_byline xml
 322   parsed_article_body     <- fp_parse_article_body xml
 323
 324   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 325   then
 326     return Nothing
 327   else do
 328     let title'   = Title    $ fromJust parsed_article_title
 329     let byline'  = Byline     parsed_article_byline
 330     let body'    = BodyHtml $ fromJust parsed_article_body
 331     return $ Just $ Article title' byline' body'
 332
 333 parse_html_article :: String -> IO (Maybe Article)
 334 parse_html_article html = do
 335   let xml = parseHtml $ wrap_in_body_div html
 336   fp_parse_article xml
 337
 338
 339 -- | In the full page, all of the article titles and bodies are
 340 --   wrapped in one big div.ArticleText.
 341 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 342 parse_bodies xml =
 343   xml >>> css "div.ArticleText"
 344
 345
 346 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 347 fp_parse_articles xml = do
 348   bodies <- runX . xshow $ parse_bodies xml
 349   let article_separator = "<h2 class=\"SummaryHL\">"
 350   let split_articles'' = split article_separator (concat bodies)
 351   -- The first element will contain the crap before the first <h2...>.
 352   let split_articles' = case split_articles'' of
 353                           (_:_) -> tail split_articles''
 354                           []    -> []
 355   -- Put the separator back, it was lost during the split.
 356   let split_articles = map (article_separator ++) split_articles'
 357   real_articles <- mapM parse_html_article split_articles
 358   let just_articles = catMaybes real_articles
 359   return just_articles
 360
 361
 362 -- | This makes it easy to select otherwise-random chunks of html
 363 --   using 'css'.
 364 wrap_in_body_div :: String -> String
 365 wrap_in_body_div s =
 366   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 367
 368
 369
 370
 371 --
 372 -- Epublishable stuff
 373 --
 374
 375 title :: Page -> String
 376 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 377 title (FullPage hl _) = hl
 378
 379
 380 metadata :: Page -> IO String
 381 metadata obj = do
 382   date <- getCurrentTime
 383   return $
 384     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 385     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 386     "<dc:language>en-US</dc:language>\n" ++
 387     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 388     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 389
 390
 391 epublish :: Page -> Handle -> IO ()
 392 epublish obj handle = do
 393   let xhtml = to_xhtml obj
 394   epmd <- metadata obj
 395   epub <- xhtml_to_epub epmd xhtml
 396   B.hPut handle epub
 397   hFlush handle
 398   hClose handle
 399
 400 xhtml_to_epub :: String -> String -> IO B.ByteString
 401 xhtml_to_epub epmd =
 402    write_epub . read_html
 403    where
 404      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 405      write_epub = writeEPUB Nothing [] my_writer_options
 406      read_html  = readHtml defaultParserState
 407
 408
 409 --
 410 -- Tests
 411 --
 412
 413 test_preprocess_links :: Assertion
 414 test_preprocess_links = do
 415   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 416   let actual_xml = actual_xml' !! 0
 417
 418   expected_xml' <- runX $ expected_xml'' >>> css "body"
 419   let expected_xml = expected_xml' !! 0
 420
 421   assertEqual
 422     "Links are replaced with spans"
 423     expected_xml
 424     actual_xml
 425   where
 426     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 427     input_xml  = parseHtml input_html
 428     expected_html = "<body><span>Hello, world!</span></body>"
 429     expected_xml'' = parseHtml expected_html
 430
 431
 432 test_absolve_images :: Assertion
 433 test_absolve_images = do
 434   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 435   let actual_xml = actual_xml' !! 0
 436
 437   expected_xml' <- runX $ expected_xml'' >>> css "body"
 438   let expected_xml = expected_xml' !! 0
 439
 440   assertEqual
 441     "Image srcs are made absolute"
 442     expected_xml
 443     actual_xml
 444   where
 445     input_html =
 446       "<body>" ++
 447       "<img src=\"/images/2012/example.jpg\" />" ++
 448       "</body>"
 449     input_xml  = parseHtml input_html
 450     expected_html =
 451       "<body>" ++
 452       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 453       "</body>"
 454     expected_xml'' = parseHtml expected_html
 455
 456
 457 test_comments_removed :: Assertion
 458 test_comments_removed = do
 459   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 460   let actual_xml = actual_xml' !! 0
 461
 462   expected_xml' <- runX $ expected_xml'' >>> css "body"
 463   let expected_xml = expected_xml' !! 0
 464
 465   assertEqual
 466     "Comment links are removed"
 467     expected_xml
 468     actual_xml
 469   where
 470     input_html =
 471       "<body><p>" ++
 472       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 473       "</p></body>"
 474     input_xml  = parseHtml input_html
 475
 476     expected_html  = "<body><p></p></body>"
 477     expected_xml'' = parseHtml expected_html
 478
 479
 480 test_full_story_urls_parsed :: Assertion
 481 test_full_story_urls_parsed = do
 482   actual <- runX $ actual'
 483
 484   assertEqual
 485     "Full Story URLs are parsed"
 486     expected
 487     actual
 488   where
 489     expected = ["/Articles/500738/", "/Articles/501837/"]
 490
 491     full_story_html =
 492       concat ["<p>",
 493               "<a href=\"/Articles/500738/\">Full Story</a> ",
 494               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 495               "<p>",
 496               "<a href=\"/Articles/501837/\">Full Story</a> ",
 497               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 498               "<p>"]
 499
 500     xml = parseHtml full_story_html
 501     actual' = xml >>> full_story_urls
 502
 503 page_tests :: Test
 504 page_tests =
 505   testGroup "Page Tests" [
 506     testCase "Links are replaced with spans" test_preprocess_links,
 507     testCase "Image srcs are made absolute" test_absolve_images,
 508     testCase "Comment links are removed" test_comments_removed,
 509     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]