src/LWN/Page.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 module LWN.Page
   4 where
   5
   6 import qualified Data.Map as Map (lookup)
   7 import Data.Time (getCurrentTime)
   8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
   9 import Data.String.Utils (split, strip)
  10 import Data.Maybe (catMaybes, fromJust, isNothing)
  11 import Prelude hiding (readFile)
  12 import System.IO (Handle, hClose, hFlush)
  13 import Test.HUnit (Assertion, assertEqual)
  14 import Test.Framework (Test, testGroup)
  15 import Test.Framework.Providers.HUnit (testCase)
  16 import Text.Pandoc (
  17   defaultParserState,
  18   defaultWriterOptions,
  19   readHtml,
  20   writeEPUB,
  21   writerEPUBMetadata)
  22 import Text.XML.HXT.Core (
  23   ArrowXml,
  24   IOSArrow,
  25   XmlTree,
  26   (>>>),
  27   (/>),
  28   (//>),
  29   changeAttrValue,
  30   getChildren,
  31   getText,
  32   hasName,
  33   processAttrl,
  34   processTopDown,
  35   runX,
  36   xshow,
  37   when)
  38 import Text.HandsomeSoup (css, parseHtml)
  39
  40 import Configuration (Cfg)
  41 import LWN.Article
  42 import LWN.HTTP (
  43   ImageMap,
  44   download_image_urls,
  45   get_article_contents)
  46 import LWN.URI (URL)
  47 import LWN.XHTML (
  48   XHTML,
  49   image_srcs,
  50   is_image,
  51   preprocess,
  52   remove_byline,
  53   remove_title,
  54   to_xhtml,
  55   xml_from_contents)
  56
  57
  58
  59
  60 data Page =
  61   -- | An LWN page with one article on it.
  62   ArticlePage { article :: Article } |
  63
  64   -- | An LWN page with more than one article on it. These require
  65   --   different parsing and display functions than the single-article
  66   --   pages.
  67   FullPage { headline :: String,
  68              articles :: [Article] }
  69
  70
  71 instance XHTML Page where
  72   to_xhtml (ArticlePage a) =
  73     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  74     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  75     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  76     "<head>" ++
  77     "  <meta http-equiv=\"Content-Type\"" ++
  78     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  79     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  80     "</head>" ++
  81     "<body>" ++
  82     (to_xhtml a) ++
  83     "</body>" ++
  84     "</html>"
  85
  86   to_xhtml (FullPage hl as) =
  87     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  88     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  89     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  90     "<head>" ++
  91     "  <meta http-equiv=\"Content-Type\"" ++
  92     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  93     "  <title>" ++ hl ++ "</title>" ++
  94     "</head>" ++
  95     "<body>" ++
  96     "<div>" ++
  97     "<h1>" ++ hl ++ "</h1>" ++
  98     (concatMap to_xhtml as) ++
  99     "</div>" ++
 100     "</body>" ++
 101     "</html>"
 102
 103
 104
 105 page_from_url :: Cfg -> URL -> IO (Maybe Page)
 106 page_from_url cfg url = do
 107   contents <- get_article_contents cfg url
 108   case (xml_from_contents contents) of
 109     Just html -> parse html
 110     Nothing -> return Nothing
 111
 112
 113
 114 -- Should be called *after* preprocessing.
 115 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 116 download_images xml = do
 117   image_urls <- runX $ xml >>> image_srcs
 118   download_image_urls image_urls
 119
 120
 121
 122 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 123 replace_remote_img_srcs image_map =
 124   processTopDown (make_srcs_local `when` is_image)
 125   where
 126     -- old_src -> new_src
 127     change_src_func :: String -> String
 128     change_src_func old_src =
 129       case Map.lookup old_src image_map of
 130         -- Leave it alone if we don't have the file locally
 131         Nothing -> old_src
 132         Just v -> v
 133
 134     change_src :: (ArrowXml a) => a XmlTree XmlTree
 135     change_src =
 136       changeAttrValue change_src_func
 137
 138     make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
 139     make_srcs_local =
 140       processAttrl $ (change_src `when` (hasName "src"))
 141
 142
 143 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 144 parse xml = do
 145   let clean_xml = xml >>> preprocess
 146   image_map <- download_images clean_xml
 147   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
 148   appr <- ap_parse local_xml
 149   fppr <- fp_parse local_xml
 150   return $
 151     if (isNothing appr) then
 152       fppr
 153     else
 154       appr
 155
 156
 157
 158 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 159 parse_headline xml = do
 160   let element_filter = xml >>> css "div.PageHeadline h1"
 161   let element_text_filter = element_filter /> getText
 162   element_text <- runX element_text_filter
 163   return $
 164     case element_text of
 165       [x] -> Just $ strip x
 166       []  -> Nothing
 167       _   -> error "Found more than one headline."
 168
 169
 170 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 171 parse_byline xml = do
 172   let element_filter = xml >>> css "div.FeatureByLine"
 173   let element_text_filter = element_filter /> getText
 174   element_text <- runX element_text_filter
 175   return $
 176     case element_text of
 177       [x] -> Just $ strip x
 178       []  -> Nothing
 179       _   -> error "Found more than one article byline."
 180
 181
 182 --
 183 -- ArticlePage Stuff
 184 --
 185 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 186 ap_parse xml = do
 187     arts <- ap_parse_articles xml
 188     case arts of
 189       [x] -> return $ Just $ ArticlePage x
 190       _   -> return Nothing
 191
 192
 193 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 194 ap_parse_body xml = do
 195   let element_filter = xml >>> css "div.ArticleText"
 196   let element_html_filter = xshow element_filter
 197   element_html <- runX element_html_filter
 198   return $ case element_html of
 199             [x] -> Just x
 200             []  -> Nothing
 201             _   -> error "Found more than one article."
 202
 203
 204 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 205 ap_parse_articles xml = do
 206   parsed_headline <- parse_headline xml
 207   parsed_byline   <- parse_byline xml
 208   parsed_body     <- ap_parse_body xml
 209
 210   putStrLn $ fromJust parsed_headline
 211
 212   if (isNothing parsed_headline) || (isNothing parsed_body)
 213   then return []
 214   else do
 215     let title'  = Title    $ fromJust parsed_headline
 216     let byline' = Byline     parsed_byline
 217     let body'   = BodyHtml $ fromJust parsed_body
 218
 219     return $ [Article title' byline' body']
 220
 221
 222
 223 --
 224 -- FullPage Stuff
 225 --
 226
 227 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 228 fp_parse xml = do
 229     hl <- parse_headline xml
 230     parsed_articles <- fp_parse_articles xml
 231     case parsed_articles of
 232       []          -> return Nothing
 233       x -> return $ Just $ FullPage (fromJust hl) x
 234
 235
 236
 237 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 238 fp_parse_article_title xml = do
 239   let element_filter = xml >>> css "h2.SummaryHL"
 240   let element_text_filter = element_filter //> getText
 241   element_text <- runX element_text_filter
 242   return $ case element_text of
 243             [x] -> Just $ strip x
 244             []  -> Nothing
 245             _   -> error "Found more than one article title."
 246
 247
 248
 249
 250 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 251 fp_parse_article_body xml = do
 252   -- First, delete the article title and byline.
 253   let clean_xml' = xml >>> remove_title >>> remove_byline
 254   -- The only child of the body element should be a div.lwn-article
 255   -- since we wrapped the article's HTML in that.
 256   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 257   clean_html <- runX . xshow $ clean_xml
 258   return $ case clean_html of
 259             [x] -> Just x
 260             []  -> Nothing
 261             _   -> error "Found more than one article body."
 262
 263 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 264 fp_parse_article xml = do
 265   parsed_article_title    <- fp_parse_article_title xml
 266   parsed_article_byline   <- parse_byline xml
 267   parsed_article_body     <- fp_parse_article_body xml
 268
 269   if (isNothing parsed_article_title) || (isNothing parsed_article_body)
 270   then
 271     return Nothing
 272   else do
 273     let title'   = Title    $ fromJust parsed_article_title
 274     let byline'  = Byline     parsed_article_byline
 275     let body'    = BodyHtml $ fromJust parsed_article_body
 276     return $ Just $ Article title' byline' body'
 277
 278 parse_html_article :: String -> IO (Maybe Article)
 279 parse_html_article html = do
 280   let xml = parseHtml $ wrap_in_body_div html
 281   fp_parse_article xml
 282
 283
 284 -- | In the full page, all of the article titles and bodies are
 285 --   wrapped in one big div.ArticleText.
 286 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 287 parse_bodies xml =
 288   xml >>> css "div.ArticleText"
 289
 290
 291 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 292 fp_parse_articles xml = do
 293   bodies <- runX . xshow $ parse_bodies xml
 294   let article_separator = "<h2 class=\"SummaryHL\">"
 295   let split_articles'' = split article_separator (concat bodies)
 296   -- The first element will contain the crap before the first <h2...>.
 297   let split_articles' = case split_articles'' of
 298                           (_:_) -> tail split_articles''
 299                           []    -> []
 300   -- Put the separator back, it was lost during the split.
 301   let split_articles = map (article_separator ++) split_articles'
 302   --_ <- mapM print_article split_articles
 303   real_articles <- mapM parse_html_article split_articles
 304   let just_articles = catMaybes real_articles
 305   return just_articles
 306
 307
 308 -- | This makes it easy to select otherwise-random chunks of html
 309 --   using 'css'.
 310 wrap_in_body_div :: String -> String
 311 wrap_in_body_div s =
 312   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 313
 314
 315
 316
 317 --
 318 -- Epublishable stuff
 319 --
 320
 321 title :: Page -> String
 322 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 323 title (FullPage hl _) = hl
 324
 325
 326 metadata :: Page -> IO String
 327 metadata obj = do
 328   date <- getCurrentTime
 329   return $
 330     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 331     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 332     "<dc:language>en-US</dc:language>\n" ++
 333     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 334     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 335
 336
 337 epublish :: Page -> Handle -> IO ()
 338 epublish obj handle = do
 339   let xhtml = to_xhtml obj
 340   epmd <- metadata obj
 341   epub <- xhtml_to_epub epmd xhtml
 342   B.hPut handle epub
 343   hFlush handle
 344   hClose handle
 345
 346 xhtml_to_epub :: String -> String -> IO B.ByteString
 347 xhtml_to_epub epmd =
 348    write_epub . read_html
 349    where
 350      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 351      write_epub = writeEPUB Nothing [] my_writer_options
 352      read_html  = readHtml defaultParserState
 353
 354
 355 --
 356 -- Tests
 357 --
 358
 359 test_preprocess_links :: Assertion
 360 test_preprocess_links = do
 361   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 362   let actual_xml = actual_xml' !! 0
 363
 364   expected_xml' <- runX $ expected_xml'' >>> css "body"
 365   let expected_xml = expected_xml' !! 0
 366
 367   assertEqual
 368     "Links are replaced with spans"
 369     expected_xml
 370     actual_xml
 371   where
 372     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 373     input_xml  = parseHtml input_html
 374     expected_html = "<body><span>Hello, world!</span></body>"
 375     expected_xml'' = parseHtml expected_html
 376
 377
 378 test_absolve_images :: Assertion
 379 test_absolve_images = do
 380   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 381   let actual_xml = actual_xml' !! 0
 382
 383   expected_xml' <- runX $ expected_xml'' >>> css "body"
 384   let expected_xml = expected_xml' !! 0
 385
 386   assertEqual
 387     "Image srcs are made absolute"
 388     expected_xml
 389     actual_xml
 390   where
 391     input_html =
 392       "<body>" ++
 393       "<img src=\"/images/2012/example.jpg\" />" ++
 394       "</body>"
 395     input_xml  = parseHtml input_html
 396     expected_html =
 397       "<body>" ++
 398       "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
 399       "</body>"
 400     expected_xml'' = parseHtml expected_html
 401
 402
 403 test_comments_removed :: Assertion
 404 test_comments_removed = do
 405   actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
 406   let actual_xml = actual_xml' !! 0
 407
 408   expected_xml' <- runX $ expected_xml'' >>> css "body"
 409   let expected_xml = expected_xml' !! 0
 410
 411   assertEqual
 412     "Comment links are removed"
 413     expected_xml
 414     actual_xml
 415   where
 416     input_html =
 417       "<body><p>" ++
 418       "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
 419       "</p></body>"
 420     input_xml  = parseHtml input_html
 421
 422     expected_html  = "<body><p></p></body>"
 423     expected_xml'' = parseHtml expected_html
 424
 425
 426
 427 page_tests :: Test
 428 page_tests =
 429   testGroup "Page Tests" [
 430     testCase "Links are replaced with spans" test_preprocess_links,
 431     testCase "Image srcs are made absolute" test_absolve_images,
 432     testCase "Comment links are removed" test_comments_removed ]