{-# LANGUAGE DoAndIfThenElse #-}

module LWN.Page
where

import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
import qualified Data.Map as Map (Map, empty, insert, lookup)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.IO (Handle, hClose, hFlush)
import Test.HUnit (Assertion, assertEqual)
import Test.Framework (Test, testGroup)
import Test.Framework.Providers.HUnit (testCase)
import Text.Pandoc (
  defaultParserState,
  defaultWriterOptions,
  readHtml,
  writeEPUB,
  writerEPUBMetadata)
import Text.XML.HXT.Core (
  ArrowXml,
  IOSArrow,
  XmlTree,
  ($<),
  (>>>),
  (/>),
  (//>),
  changeAttrValue,
  getAttrValue,
  getChildren,
  getText,
  hasName,
  processAttrl,
  processTopDown,
  this,
  runX,
  xshow,
  when)
import Text.HandsomeSoup (css, parseHtml)

import Configuration (Cfg)
import LWN.Article
import LWN.HTTP (
  ImageMap,
  download_image_urls,
  get_article_contents)
import LWN.URI (URL)
import LWN.XHTML (
  XHTML,
  full_story_urls,
  image_srcs,
  full_story_link,
  full_story_paragraph,
  is_image,
  preprocess,
  remove_byline,
  remove_title,
  to_xhtml,
  to_xml,
  xml_from_contents)


data Page =
  -- | An LWN page with one article on it.
  ArticlePage { article :: Article } |

  -- | An LWN page with more than one article on it. These require
  -- different parsing and display functions than the single-article
  -- pages.
  FullPage { headline :: String,
             articles :: [Article] }

instance XHTML Page where
  to_xhtml (ArticlePage a) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html xmlns=\"http://www.w3.org/1999/xhtml\">" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ (getTitle $ LWN.Article.title a) ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    (to_xhtml a) ++
    "</body>" ++
    "</html>"

  to_xhtml (FullPage hl as) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html xmlns=\"http://www.w3.org/1999/xhtml\">" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ hl ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    "<div>" ++
    "<h1>" ++ hl ++ "</h1>" ++
    (concatMap to_xhtml as) ++
    "</div>" ++
    "</body>" ++
    "</html>"


page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  contents <- get_article_contents cfg url
  case (xml_from_contents contents) of
    Just html -> parse cfg html
    Nothing -> return Nothing

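-- An illustrative usage sketch, not part of the original module: it
-- assumes a 'Cfg' value is available and that 'withFile' and 'WriteMode'
-- are imported from System.IO; the URL and output file name are made up
-- for illustration. It shows roughly how 'page_from_url' combines with
-- 'epublish' (defined further down):
--
--   convert_current_edition :: Cfg -> IO ()
--   convert_current_edition cfg = do
--     result <- page_from_url cfg "https://lwn.net/current/bigpage"
--     case result of
--       Just page -> withFile "lwn.epub" WriteMode (epublish page)
--       Nothing   -> putStrLn "Error: could not parse the page."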


insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
insert_full_stories story_map =
  processTopDown (article_xml `when` full_story_paragraph)
  where
    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
    lookup_func href =
      case Map.lookup href story_map of
        -- Leave it alone if we don't have the full story.
        Nothing -> this
        Just v -> to_xml v

    article_xml :: (ArrowXml a) => a XmlTree XmlTree
    article_xml =
      lookup_func
      $<
      (this /> full_story_link >>> getAttrValue "href")

replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (make_srcs_local `when` is_image)
  where
    -- old_src -> new_src
    change_src_func :: String -> String
    change_src_func old_src =
      case Map.lookup old_src image_map of
        -- Leave it alone if we don't have the file locally
        Nothing -> old_src
        Just v -> v

    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue change_src_func

    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_local =
      processAttrl $ (change_src `when` (hasName "src"))

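-- An illustrative sketch, not taken from the original source: if the
-- ImageMap maps the remote URL
--
--   "https://lwn.net/images/2012/example.jpg"
--
-- to a hypothetical local path such as "/tmp/lwn-epub/example.jpg", then
-- every <img> whose src is that URL is rewritten to point at the local
-- copy, so the EPUB can bundle the file instead of linking to the web.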


-- Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml = do
  image_urls <- runX $ xml >>> image_srcs
  download_image_urls image_urls


type StoryMap = Map.Map URL Article

-- These come *before* preprocessing.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
  pages <- mapM (page_from_url cfg) story_urls
  let pairs = zip story_urls pages
  return $ foldl my_insert empty_map pairs
  where
    empty_map = Map.empty :: StoryMap

    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
    my_insert dict (_, _) = dict


download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
download_full_stories cfg xml = do
  story_urls <- runX $ xml >>> full_story_urls
  download_full_story_urls cfg story_urls


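-- | Parse a page: first splice in any downloaded "Full Story"
-- articles, then preprocess the XML, download the images it references
-- and point the img srcs at the local copies, and finally try to
-- interpret the result as a single-article page before falling back to
-- a full (multi-article) page.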
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
  story_map <- download_full_stories cfg xml
  let fs_xml = xml >>> insert_full_stories story_map

  let clean_xml = fs_xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map

  appr <- ap_parse local_xml
  fppr <- fp_parse local_xml
  return $
    if (isNothing appr) then
      fppr
    else
      appr


parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  let element_filter = xml >>> css "div.PageHeadline h1"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one headline."


parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml = do
  let element_filter = xml >>> css "div.FeatureByLine"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one article byline."


--
-- ArticlePage Stuff
--
ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
ap_parse xml = do
  arts <- ap_parse_articles xml
  case arts of
    [x] -> return $ Just $ ArticlePage x
    _   -> return Nothing


ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  let element_filter = xml >>> css "div.ArticleText"
  let element_html_filter = xshow element_filter
  element_html <- runX element_html_filter
  return $ case element_html of
    [x] -> Just x
    []  -> Nothing
    _   -> error "Found more than one article."


ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  parsed_headline <- parse_headline xml
  parsed_byline <- parse_byline xml
  parsed_body <- ap_parse_body xml

  -- Debugging output; only print the headline if we actually found one.
  maybe (return ()) putStrLn parsed_headline

  if (isNothing parsed_headline) || (isNothing parsed_body)
  then return []
  else do
    let title' = Title $ fromJust parsed_headline
    let byline' = Byline parsed_byline
    let body' = BodyHtml $ fromJust parsed_body

    return $ [Article title' byline' body']


--
-- FullPage Stuff
--

fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
fp_parse xml = do
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  case parsed_articles of
    [] -> return Nothing
    x  -> return $ Just $ FullPage (fromJust hl) x



fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  let element_filter = xml >>> css "h2.SummaryHL"
  let element_text_filter = element_filter //> getText
  element_text <- runX element_text_filter
  return $ case element_text of
    [x] -> Just $ strip x
    []  -> Nothing
    _   -> error "Found more than one article title."



fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  -- First, delete the article title and byline.
  let clean_xml' = xml >>> remove_title >>> remove_byline
  -- The only child of the body element should be a div.lwn-article
  -- since we wrapped the article's HTML in that.
  let clean_xml = clean_xml' >>> css "body" >>> getChildren
  clean_html <- runX . xshow $ clean_xml
  return $ case clean_html of
    [x] -> Just x
    []  -> Nothing
    _   -> error "Found more than one article body."

fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  parsed_article_title <- fp_parse_article_title xml
  parsed_article_byline <- parse_byline xml
  parsed_article_body <- fp_parse_article_body xml

  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
  then
    return Nothing
  else do
    let title' = Title $ fromJust parsed_article_title
    let byline' = Byline parsed_article_byline
    let body' = BodyHtml $ fromJust parsed_article_body
    return $ Just $ Article title' byline' body'

parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
  let xml = parseHtml $ wrap_in_body_div html
  fp_parse_article xml


-- | In the full page, all of the article titles and bodies are
-- wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
parse_bodies xml =
  xml >>> css "div.ArticleText"


fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  bodies <- runX . xshow $ parse_bodies xml
  let article_separator = "<h2 class=\"SummaryHL\">"
  let split_articles'' = split article_separator (concat bodies)
  -- The first element will contain the crap before the first <h2...>.
  let split_articles' = case split_articles'' of
                          (_:_) -> tail split_articles''
                          []    -> []
  -- Put the separator back; it was lost during the split.
  let split_articles = map (article_separator ++) split_articles'
  --_ <- mapM print_article split_articles
  real_articles <- mapM parse_html_article split_articles
  let just_articles = catMaybes real_articles
  return just_articles

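-- An illustrative sketch, not from the original source, of how the
-- splitting above behaves. Given a (made-up) ArticleText body like
--
--   <junk/><h2 class="SummaryHL">First</h2>...<h2 class="SummaryHL">Second</h2>...
--
-- splitting on the separator yields ["<junk/>", "First</h2>...",
-- "Second</h2>..."]; the leading junk is dropped and the separator is
-- prepended again, leaving one self-contained HTML chunk per article
-- headline.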

-- | This makes it easy to select otherwise-random chunks of html
-- using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"

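-- A small usage sketch, not from the original source; the result shown
-- is what HXT/HandsomeSoup should produce for this input, assuming the
-- fragment parses cleanly:
--
--   >>> runX $ parseHtml (wrap_in_body_div "<p>hi</p>") >>> css "div.lwn-article" //> getText
--   ["hi"]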


--
-- Epublishable stuff
--

title :: Page -> String
title (ArticlePage a) = getTitle $ LWN.Article.title a
title (FullPage hl _) = hl


metadata :: Page -> IO String
metadata obj = do
  date <- getCurrentTime
  return $
    "<dc:creator>http://lwn.net/</dc:creator>\n" ++
    "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
    "<dc:language>en-US</dc:language>\n" ++
    "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
    "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"


epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  let xhtml = to_xhtml obj
  epmd <- metadata obj
  epub <- xhtml_to_epub epmd xhtml
  B.hPut handle epub
  hFlush handle
  hClose handle

xhtml_to_epub :: String -> String -> IO B.ByteString
xhtml_to_epub epmd =
  write_epub . read_html
  where
    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    write_epub = writeEPUB Nothing [] my_writer_options
    read_html = readHtml defaultParserState


--
-- Tests
--

test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Links are replaced with spans"
    expected_xml
    actual_xml
  where
    input_html = "<body><a href=\"#\">Hello, world!</a></body>"
    input_xml = parseHtml input_html
    expected_html = "<body><span>Hello, world!</span></body>"
    expected_xml'' = parseHtml expected_html


test_absolve_images :: Assertion
test_absolve_images = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Image srcs are made absolute"
    expected_xml
    actual_xml
  where
    input_html =
      "<body>" ++
      "<img src=\"/images/2012/example.jpg\" />" ++
      "</body>"
    input_xml = parseHtml input_html
    expected_html =
      "<body>" ++
      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
      "</body>"
    expected_xml'' = parseHtml expected_html


test_comments_removed :: Assertion
test_comments_removed = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Comment links are removed"
    expected_xml
    actual_xml
  where
    input_html =
      "<body><p>" ++
      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
      "</p></body>"
    input_xml = parseHtml input_html

    expected_html = "<body><p></p></body>"
    expected_xml'' = parseHtml expected_html


test_full_story_urls_parsed :: Assertion
test_full_story_urls_parsed = do
  actual <- runX $ actual'

  assertEqual
    "Full Story URLs are parsed"
    expected
    actual
  where
    expected = ["/Articles/500738/", "/Articles/501837/"]

    full_story_html =
      concat ["<p>",
              "<a href=\"/Articles/500738/\">Full Story</a> ",
              "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
              "<p>",
              "<a href=\"/Articles/501837/\">Full Story</a> ",
              "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
              "<p>"]

    xml = parseHtml full_story_html
    actual' = xml >>> full_story_urls


page_tests :: Test
page_tests =
  testGroup "Page Tests" [
    testCase "Links are replaced with spans" test_preprocess_links,
    testCase "Image srcs are made absolute" test_absolve_images,
    testCase "Comment links are removed" test_comments_removed,
    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]
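
-- A hedged sketch, not part of this module, of how 'page_tests' might be
-- run from a separate test driver using test-framework's 'defaultMain'
-- (the driver module itself is made up):
--
--   module Main (main) where
--
--   import Test.Framework (defaultMain)
--   import LWN.Page (page_tests)
--
--   main :: IO ()
--   main = defaultMain [page_tests]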