src/LWN/Page.hs

   1 module LWN.Page
   2 where
   3
   4 import Text.Pandoc
   5 import Data.Time (getCurrentTime)
   6 import System.IO (Handle)
   7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
   8 import Data.String.Utils (split, strip)
   9 import Data.Maybe (catMaybes, fromJust, isNothing)
  10 import Data.Tree.NTree.TypeDefs (NTree)
  11 import Test.HUnit (Assertion, assertEqual)
  12 import Test.Framework (Test, testGroup)
  13 import Test.Framework.Providers.HUnit (testCase)
  14 import Text.XML.HXT.Core (
  15   ArrowXml,
  16   IOSArrow,
  17   XmlTree,
  18   XNode,
  19   (>>>),
  20   (/>),
  21   (//>),
  22   getChildren,
  23   getText,
  24   hasAttrValue,
  25   hasName,
  26   mkName,
  27   none,
  28   processAttrl,
  29   processTopDown,
  30   runX,
  31   setElemName,
  32   xshow,
  33   when
  34   )
  35 import Text.HandsomeSoup (css, parseHtml)
  36
  37 import LWN.Article
  38 import XHTML
  39
  40 data Page =
  41   -- | An LWN page with one article on it.
  42   ArticlePage { article :: Article } |
  43
  44   -- | An LWN page with more than one article on it. These require
  45   --   different parsing and display functions than the single-article
  46   --   pages.
  47   FullPage { headline :: String,
  48              articles :: [Article] }
  49
  50
  51 instance XHTML Page where
  52   to_xhtml (ArticlePage a) =
  53     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  54     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  55     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  56     "<head>" ++
  57     "  <meta http-equiv=\"Content-Type\"" ++
  58     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  59     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  60     "</head>" ++
  61     "<body>" ++
  62     "<div>" ++
  63     (to_xhtml a) ++
  64     "</div>" ++
  65     "</body>" ++
  66     "</html>"
  67
  68   to_xhtml (FullPage hl as) =
  69     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  70     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  71     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  72     "<head>" ++
  73     "  <meta http-equiv=\"Content-Type\"" ++
  74     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  75     "  <title>" ++ hl ++ "</title>" ++
  76     "</head>" ++
  77     "<body>" ++
  78     "<div>" ++
  79     "<h1>" ++ hl ++ "</h1>" ++
  80     (concatMap to_xhtml as) ++
  81     "</div>" ++
  82     "</body>" ++
  83     "</html>"
  84
  85
  86
  87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
  88 remove_images =
  89   processTopDown ((none) `when` is_image)
  90
  91
  92 is_link :: (ArrowXml a) => a XmlTree XmlTree
  93 is_link =
  94   hasName "a"
  95
  96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
  97 replace_links_with_spans =
  98   processTopDown $ (make_span >>> remove_attrs) `when` is_link
  99   where
 100     make_span    = setElemName $ mkName "span"
 101     remove_attrs = processAttrl none
 102
 103 -- | Preprocessing common to both page types.
 104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 105 preprocess xml =
 106   xml >>> remove_images >>> replace_links_with_spans
 107
 108
 109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 110 parse xml = do
 111   let clean_xml = preprocess xml
 112   appr <- ap_parse clean_xml
 113   fppr <- fp_parse clean_xml
 114   return $
 115     if (isNothing appr) then
 116       fppr
 117     else
 118       appr
 119
 120 --
 121 -- ArticlePage Stuff
 122 --
 123 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 124 ap_parse xml = do
 125     arts <- ap_parse_articles xml
 126     case arts of
 127       Just [x] -> return $ Just $ ArticlePage x
 128       _   -> return Nothing
 129
 130
 131 ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 132 ap_parse_headline xml = do
 133   let element_filter = xml >>> css "div.PageHeadline h1"
 134   let element_text_filter = element_filter /> getText
 135   element_text <- runX element_text_filter
 136   return $ case element_text of
 137             [x] -> Just $ strip x
 138             []  -> Nothing
 139             _   -> error "Found more than one headline."
 140
 141 ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 142 ap_parse_byline xml = do
 143   let element_filter = xml >>> css "div.Byline"
 144   let element_text_filter = element_filter /> getText
 145   element_text <- runX element_text_filter
 146   return $ case element_text of
 147             [x] -> Just $ strip x
 148             []  -> Nothing
 149             _   -> error "Found more than one byline."
 150
 151
 152 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 153 ap_parse_body xml = do
 154   let element_filter = xml >>> css "div.ArticleText"
 155   let element_html_filter = xshow element_filter
 156   element_html <- runX element_html_filter
 157   return $ case element_html of
 158             [x] -> Just x
 159             []  -> Nothing
 160             _   -> error "Found more than one article."
 161
 162
 163 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
 164 ap_parse_articles xml = do
 165   parsed_headline <- ap_parse_headline xml
 166   parsed_byline   <- ap_parse_byline xml
 167   parsed_body     <- ap_parse_body xml
 168   let title'   = Title (fromJust parsed_headline)
 169   let byline' = Byline  parsed_byline
 170   let body'   = BodyHtml (fromJust parsed_body)
 171   return $ Just $ [Article title' byline' body']
 172
 173
 174
 175
 176 --
 177 -- FullPage Stuff
 178 --
 179
 180 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 181 fp_parse xml = do
 182     hl <- fp_parse_headline xml
 183     parsed_articles <- fp_parse_articles xml
 184     case parsed_articles of
 185       them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
 186       _          -> return Nothing
 187
 188
 189
 190
 191 fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 192 fp_parse_headline xml = do
 193   let element_filter = xml >>> css "div.PageHeadline h1"
 194   let element_text_filter = element_filter /> getText
 195   element_text <- runX element_text_filter
 196   return $ case element_text of
 197             [x] -> Just $ strip x
 198             []  -> Nothing
 199             _   -> error "Found more than one headline."
 200
 201 fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 202 fp_parse_article_byline xml = do
 203   let element_filter = xml >>> css "div.FeatureByLine"
 204   let element_text_filter = element_filter /> getText
 205   element_text <- runX element_text_filter
 206   return $ case element_text of
 207             [x] -> Just $ strip x
 208             []  -> Nothing
 209             _   -> error "Found more than one article byline."
 210
 211
 212 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 213 fp_parse_article_title xml = do
 214   let element_filter = xml >>> css "h2.SummaryHL"
 215   let element_text_filter = element_filter //> getText
 216   element_text <- runX element_text_filter
 217   return $ case element_text of
 218             [x] -> Just $ strip x
 219             []  -> Nothing
 220             _   -> error "Found more than one article title."
 221
 222
 223
 224 is_title :: (ArrowXml a) => a XmlTree XmlTree
 225 is_title =
 226   (hasName "h2")
 227   >>>
 228   (hasAttrValue "class" (== "SummaryHL"))
 229
 230
 231 is_byline :: (ArrowXml a) => a XmlTree XmlTree
 232 is_byline =
 233   (hasName "div")
 234   >>>
 235   (hasAttrValue "class" (== "FeatureByLine"))
 236
 237
 238 is_image :: (ArrowXml a) => a XmlTree XmlTree
 239 is_image =
 240   hasName "img"
 241
 242 remove_title :: (ArrowXml a) => a XmlTree XmlTree
 243 remove_title =
 244   processTopDown ((none) `when` is_title)
 245
 246
 247 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
 248 remove_byline =
 249   processTopDown ((none) `when` is_byline)
 250
 251
 252
 253 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 254 fp_parse_article_body xml = do
 255   -- First, delete the article title and byline.
 256   let clean_xml' = xml >>> remove_title >>> remove_byline
 257   -- The only child of the body element should be a div.lwn-article
 258   -- since we wrapped the article's HTML in that.
 259   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 260   clean_html <- runX . xshow $ clean_xml
 261   return $ case clean_html of
 262             [x] -> Just x
 263             []  -> Nothing
 264             _   -> error "Found more than one article body."
 265
 266 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
 267 fp_parse_article xml = do
 268   parsed_article_title    <- fp_parse_article_title xml
 269   parsed_article_byline   <- fp_parse_article_byline xml
 270   parsed_article_body     <- fp_parse_article_body xml
 271   let title'   = Title    $ fromJust parsed_article_title
 272   let byline'  = Byline     parsed_article_byline
 273   let body'    = BodyHtml $ fromJust parsed_article_body
 274   return $ Just $ Article title' byline' body'
 275
 276 parse_html_article :: String -> IO (Maybe Article)
 277 parse_html_article html = do
 278   let xml = parseHtml $ wrap_in_body_div html
 279   fp_parse_article xml
 280
 281
 282 -- | In the full page, all of the article titles and bodies are
 283 --   wrapped in a div.ArticleText.
 284 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 285 parse_bodies xml =
 286   xml >>> css "div.ArticleText"
 287
 288
 289 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
 290 fp_parse_articles xml = do
 291   bodies <- runX . xshow $ parse_bodies xml
 292   let article_separator = "<h2 class=\"SummaryHL\">"
 293   let split_articles'' = split article_separator (concat bodies)
 294   -- The first element will contain the crap before the first <h2...>.
 295   let split_articles' = tail split_articles''
 296   -- Put the separator back, it was lost during the split.
 297   let split_articles = map (article_separator ++) split_articles'
 298   --_ <- mapM print_article split_articles
 299   real_articles <- mapM parse_html_article split_articles
 300   let just_articles = catMaybes real_articles
 301   return just_articles
 302
 303
 304 wrap_in_body_div :: String -> String
 305 wrap_in_body_div s =
 306   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 307
 308
 309
 310
 311 --
 312 -- Epublishable stuff
 313 --
 314
 315 title :: Page -> String
 316 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 317 title (FullPage hl _) = hl
 318
 319
 320 metadata :: Page -> IO String
 321 metadata obj = do
 322   date <- getCurrentTime
 323   return $
 324     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 325     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 326     "<dc:language>en-US</dc:language>\n" ++
 327     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 328     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 329
 330
 331 epublish :: Page -> Handle -> IO ()
 332 epublish obj handle = do
 333   let xhtml = to_xhtml obj
 334   epmd <- metadata obj
 335   epub <- xhtml_to_epub epmd xhtml
 336   B.hPut handle epub
 337
 338
 339 xhtml_to_epub :: String -> String -> IO B.ByteString
 340 xhtml_to_epub epmd =
 341    write_epub . read_html
 342    where
 343      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 344      write_epub = writeEPUB Nothing [] my_writer_options
 345      read_html  = readHtml defaultParserState
 346
 347
 348
 349
 350 test_preprocess_links :: Assertion
 351 test_preprocess_links = do
 352   actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
 353   let actual_xml = actual_xml' !! 0
 354
 355   expected_xml' <- runX $ expected_xml'' >>> css "body"
 356   let expected_xml = expected_xml' !! 0
 357
 358   assertEqual
 359     "Links are replaced with spans"
 360     expected_xml
 361     actual_xml
 362   where
 363     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 364     input_xml  = parseHtml input_html
 365     expected_html = "<body><span>Hello, world!</span></body>"
 366     expected_xml'' = parseHtml expected_html
 367
 368 page_tests :: Test
 369 page_tests =
 370   testGroup "Page Tests" [
 371     testCase "Links are replaced with spans" test_preprocess_links ]