src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import Control.Concurrent.ParallelIO (parallel)
   7 import qualified Data.Map as Map (lookup)
   8 import Data.Time (getCurrentTime)
   9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  10 import Data.String.Utils (split, strip)
  11 import qualified Data.Map as Map (Map, empty, insert)
  12 import Data.Maybe (catMaybes, fromJust, isNothing)
  13 import Prelude hiding (readFile)
  14 import System.IO (Handle, hClose, hFlush)
  15 import Test.HUnit (Assertion, assertEqual)
  16 import Test.Framework (Test, testGroup)
  17 import Test.Framework.Providers.HUnit (testCase)
  18 import Text.Pandoc (
  19   defaultParserState,
  20   defaultWriterOptions,
  21   readHtml,
  22   writeEPUB,
  23   writerEPUBMetadata)
  24 import Text.XML.HXT.Core (
  25   ArrowXml,
  26   IOSArrow,
  27   XmlTree,
  28   ($<),
  29   (>>>),
  30   (/>),
  31   (//>),
  32   changeAttrValue,
  33   getAttrValue,
  34   getChildren,
  35   getText,
  36   hasName,
  37   processAttrl,
  38   processTopDown,
  39   this,
  40   runX,
  41   xshow,
  42   when)
  43 import Text.HandsomeSoup (css, parseHtml)
  44
  45 import Configuration (Cfg)
  46 import LWN.Article
  47 import LWN.HTTP (
  48   ImageMap,
  49   download_image_urls,
  50   get_article_contents)
  51 import LWN.URI (URL)
  52 import LWN.XHTML (
  53   XHTML,
  54   full_story_urls,
  55   image_srcs,
  56   full_story_link,
  57   full_story_paragraph,
  58   is_image,
  59   preprocess,
  60   remove_byline,
  61   remove_title,
  62   to_xhtml,
  63   to_xml,
  64   xml_from_contents)
  65
  66
  67
  68
  69 data Page =
  70   -- | An LWN page with one article on it.
  71   ArticlePage { article :: Article } |
  72
  73   -- | An LWN page with more than one article on it. These require
  74   --   different parsing and display functions than the single-article
  75   --   pages.
  76   FullPage { headline :: String,
  77              articles :: [Article] }
  78
  79
  80 instance XHTML Page where
  81   to_xhtml (ArticlePage a) =
  82     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  83     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  84     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  85     "<head>" ++
  86     "  <meta http-equiv=\"Content-Type\"" ++
  87     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  88     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  89     "</head>" ++
  90     "<body>" ++
  91     (to_xhtml a) ++
  92     "</body>" ++
  93     "</html>"
  94
  95   to_xhtml (FullPage hl as) =
  96     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  97     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  98     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  99     "<head>" ++
 100     "  <meta http-equiv=\"Content-Type\"" ++
 101     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
 102     "  <title>" ++ hl ++ "</title>" ++
 103     "</head>" ++
 104     "<body>" ++
 105     "<div>" ++
 106     "<h1>" ++ hl ++ "</h1>" ++
 107     (concatMap to_xhtml as) ++
 108     "</div>" ++
 109     "</body>" ++
 110     "</html>"
 111
 112
 113
 114 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 115 page_from_url cfg url = do
 116   contents <- get_article_contents cfg url
 117   case (xml_from_contents contents) of
 118     Just html -> parse cfg html
 119     Nothing -> return Nothing
 120
 121
 122
 123 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
 124 insert_full_stories story_map =
 125   processTopDown (article_xml `when` full_story_paragraph)
 126   where
 127     lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
 128     lookup_func href =
 129       case Map.lookup href story_map of
 130         -- Leave it alone if we don't have the full story.
 131         Nothing -> this
 132         Just v -> to_xml v
 133
 134     article_xml :: (ArrowXml a) => a XmlTree XmlTree
 135     article_xml =
 136       lookup_func
 137       $<
 138       (this /> full_story_link >>> getAttrValue "href")
 139
 140 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 141 replace_remote_img_srcs image_map =
 142   processTopDown (make_srcs_local `when` is_image)
 143   where
 144     -- old_src -> new_src
 145     change_src_func :: String -> String
 146     change_src_func old_src =
 147       case Map.lookup old_src image_map of
 148         -- Leave it alone if we don't have the file locally
 149         Nothing -> old_src
 150         Just v -> v
 151
 152     change_src :: (ArrowXml a) => a XmlTree XmlTree
 153     change_src =
 154       changeAttrValue change_src_func
 155
 156     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 157     make_srcs_local =
 158       processAttrl $ (change_src `when` (hasName "src"))
 159
 160
 161
 162
 163 -- Should be called *after* preprocessing.
 164 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 165 download_images xml = do
 166   image_urls <- runX $ xml >>> image_srcs
 167   download_image_urls image_urls
 168
 169
 170
-- | A map from a story's URL to the 'Article' stored for that URL.
type StoryMap = Map.Map URL Article
 172
 173 -- These come *before* preprocessing.
 174 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
 175 download_full_story_urls cfg story_urls = do
 176   pages <- parallel $ map (page_from_url cfg) story_urls
 177   let pairs = zip story_urls pages
 178   return $ foldl my_insert empty_map pairs
 179   where
 180     empty_map = Map.empty :: StoryMap
 181
 182     my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
 183     my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
 184     my_insert dict (_, _)  = dict
 185
 186
 187 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
 188 download_full_stories cfg xml = do
 189   story_urls <- runX $ xml >>> full_story_urls
 190   download_full_story_urls cfg story_urls
 191
 192
 193 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 194 parse cfg xml = do
 195   story_map <- download_full_stories cfg xml
 196   let fs_xml = xml >>> insert_full_stories story_map
 197
 198   let clean_xml = fs_xml >>> preprocess
 199   image_map <- download_images clean_xml
 200   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 201
 202   appr <- ap_parse local_xml
 203   fppr <- fp_parse local_xml
 204   return $
 205     if (isNothing appr) then
 206       fppr
 207     else
 208       appr
 209
 210
 211
 212 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 213 parse_headline xml = do
 214   let element_filter = xml >>> css "div.PageHeadline h1"
 215   let element_text_filter = element_filter /> getText
 216   element_text <- runX element_text_filter
 217   return $
 218     case element_text of
 219       [x] -> Just $ strip x
 220       []  -> Nothing
 221       _   -> error "Found more than one headline."
 222
 223
 224 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 225 parse_byline xml = do
 226   let element_filter = xml >>> css "div.FeatureByLine"
 227   let element_text_filter = element_filter /> getText
 228   element_text <- runX element_text_filter
 229   return $
 230     case element_text of
 231       [x] -> Just $ strip x
 232       []  -> Nothing
 233       _   -> error "Found more than one article byline."
 234
 235
 236 --
 237 -- ArticlePage Stuff
 238 --
 239 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 240 ap_parse xml = do
 241     arts <- ap_parse_articles xml
 242     case arts of
 243       [x] -> return $ Just $ ArticlePage x
 244       _   -> return Nothing
 245
 246
 247 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 248 ap_parse_body xml = do
 249   let element_filter = xml >>> css "div.ArticleText"
 250   let element_html_filter = xshow element_filter
 251   element_html <- runX element_html_filter
 252   return $ case element_html of
 253             [x] -> Just x
 254             []  -> Nothing
 255             _   -> error "Found more than one article."
 256
 257
 258 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 259 ap_parse_articles xml = do
 260   parsed_headline <- parse_headline xml
 261   parsed_byline   <- parse_byline xml
 262   parsed_body     <- ap_parse_body xml
 263
 264   putStrLn $ fromJust parsed_headline
 265
 266   if (isNothing parsed_headline) || (isNothing parsed_body)
 267   then return []
 268   else do
 269     let title'  = Title    $ fromJust parsed_headline
 270     let byline' = Byline     parsed_byline
 271     let body'   = BodyHtml $ fromJust parsed_body
 272
 273     return $ [Article title' byline' body']
 274
 275
 276
 277 --
 278 -- FullPage Stuff
 279 --
 280
 281 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 282 fp_parse xml = do
 283     hl <- parse_headline xml
 284     parsed_articles <- fp_parse_articles xml
 285     case parsed_articles of
 286       []          -> return Nothing
 287       x -> return $ Just $ FullPage (fromJust hl) x
 288
 289
 290
 291 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 292 fp_parse_article_title xml = do
 293   let element_filter = xml >>> css "h2.SummaryHL"
 294   let element_text_filter = element_filter //> getText
 295   element_text <- runX element_text_filter
 296   return $ case element_text of
 297             [x] -> Just $ strip x
 298             []  -> Nothing
 299             _   -> error "Found more than one article title."
 300
 301
 302
 303
 304 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 305 fp_parse_article_body xml = do
 306   -- First, delete the article title and byline.
 307   let clean_xml' = xml >>> remove_title >>> remove_byline
 308   -- The only child of the body element should be a div.lwn-article
 309   -- since we wrapped the article's HTML in that.
 310   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 311   clean_html <- runX . xshow $ clean_xml
 312   return $ case clean_html of
 313             [x] -> Just x
 314             []  -> Nothing
 315             _   -> error "Found more than one article body."
 316
 317 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 318 fp_parse_article xml = do
 319   parsed_article_title    <- fp_parse_article_title xml
 320   parsed_article_byline   <- parse_byline xml
 321   parsed_article_body     <- fp_parse_article_body xml
 322
 323   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 324   then
 325     return Nothing
 326   else do
 327     let title'   = Title    $ fromJust parsed_article_title
 328     let byline'  = Byline     parsed_article_byline
 329     let body'    = BodyHtml $ fromJust parsed_article_body
 330     return $ Just $ Article title' byline' body'
 331
 332 parse_html_article :: String -> IO (Maybe Article)
 333 parse_html_article html = do
 334   let xml = parseHtml $ wrap_in_body_div html
 335   fp_parse_article xml
 336
 337
 338 -- | In the full page, all of the article titles and bodies are
 339 --   wrapped in one big div.ArticleText.
 340 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 341 parse_bodies xml =
 342   xml >>> css "div.ArticleText"
 343
 344
 345 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 346 fp_parse_articles xml = do
 347   bodies <- runX . xshow $ parse_bodies xml
 348   let article_separator = "<h2 class=\"SummaryHL\">"
 349   let split_articles'' = split article_separator (concat bodies)
 350   -- The first element will contain the crap before the first <h2...>.
 351   let split_articles' = case split_articles'' of
 352                           (_:_) -> tail split_articles''
 353                           []    -> []
 354   -- Put the separator back, it was lost during the split.
 355   let split_articles = map (article_separator ++) split_articles'
 356   real_articles <- mapM parse_html_article split_articles
 357   let just_articles = catMaybes real_articles
 358   return just_articles
 359
 360
 361 -- | This makes it easy to select otherwise-random chunks of html
 362 --   using 'css'.
 363 wrap_in_body_div :: String -> String
 364 wrap_in_body_div s =
 365   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 366
 367
 368
 369
 370 --
 371 -- Epublishable stuff
 372 --
 373
 374 title :: Page -> String
 375 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 376 title (FullPage hl _) = hl
 377
 378
 379 metadata :: Page -> IO String
 380 metadata obj = do
 381   date <- getCurrentTime
 382   return $
 383     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 384     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 385     "<dc:language>en-US</dc:language>\n" ++
 386     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 387     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 388
 389
 390 epublish :: Page -> Handle -> IO ()
 391 epublish obj handle = do
 392   let xhtml = to_xhtml obj
 393   epmd <- metadata obj
 394   epub <- xhtml_to_epub epmd xhtml
 395   B.hPut handle epub
 396   hFlush handle
 397   hClose handle
 398
 399 xhtml_to_epub :: String -> String -> IO B.ByteString
 400 xhtml_to_epub epmd =
 401    write_epub . read_html
 402    where
 403      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 404      write_epub = writeEPUB Nothing [] my_writer_options
 405      read_html  = readHtml defaultParserState
 406
 407
 408 --
 409 -- Tests
 410 --
 411
 412 test_preprocess_links :: Assertion
 413 test_preprocess_links = do
 414   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 415   let actual_xml = actual_xml' !! 0
 416
 417   expected_xml' <- runX $ expected_xml'' >>> css "body"
 418   let expected_xml = expected_xml' !! 0
 419
 420   assertEqual
 421     "Links are replaced with spans"
 422     expected_xml
 423     actual_xml
 424   where
 425     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 426     input_xml  = parseHtml input_html
 427     expected_html = "<body><span>Hello, world!</span></body>"
 428     expected_xml'' = parseHtml expected_html
 429
 430
 431 test_absolve_images :: Assertion
 432 test_absolve_images = do
 433   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 434   let actual_xml = actual_xml' !! 0
 435
 436   expected_xml' <- runX $ expected_xml'' >>> css "body"
 437   let expected_xml = expected_xml' !! 0
 438
 439   assertEqual
 440     "Image srcs are made absolute"
 441     expected_xml
 442     actual_xml
 443   where
 444     input_html =
 445       "<body>" ++
 446       "<img src=\"/images/2012/example.jpg\" />" ++
 447       "</body>"
 448     input_xml  = parseHtml input_html
 449     expected_html =
 450       "<body>" ++
 451       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 452       "</body>"
 453     expected_xml'' = parseHtml expected_html
 454
 455
 456 test_comments_removed :: Assertion
 457 test_comments_removed = do
 458   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 459   let actual_xml = actual_xml' !! 0
 460
 461   expected_xml' <- runX $ expected_xml'' >>> css "body"
 462   let expected_xml = expected_xml' !! 0
 463
 464   assertEqual
 465     "Comment links are removed"
 466     expected_xml
 467     actual_xml
 468   where
 469     input_html =
 470       "<body><p>" ++
 471       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 472       "</p></body>"
 473     input_xml  = parseHtml input_html
 474
 475     expected_html  = "<body><p></p></body>"
 476     expected_xml'' = parseHtml expected_html
 477
 478
 479 test_full_story_urls_parsed :: Assertion
 480 test_full_story_urls_parsed = do
 481   actual <- runX $ actual'
 482
 483   assertEqual
 484     "Full Story URLs are parsed"
 485     expected
 486     actual
 487   where
 488     expected = ["/Articles/500738/", "/Articles/501837/"]
 489
 490     full_story_html =
 491       concat ["<p>",
 492               "<a href=\"/Articles/500738/\">Full Story</a> ",
 493               "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
 494               "<p>",
 495               "<a href=\"/Articles/501837/\">Full Story</a> ",
 496               "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
 497               "<p>"]
 498
 499     xml = parseHtml full_story_html
 500     actual' = xml >>> full_story_urls
 501
 502 page_tests :: Test
 503 page_tests =
 504   testGroup "Page Tests" [
 505     testCase "Links are replaced with spans" test_preprocess_links,
 506     testCase "Image srcs are made absolute" test_absolve_images,
 507     testCase "Comment links are removed" test_comments_removed,
 508     testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]