src/LWN/Page.hs

   1 module LWN.Page
   2 where
   3
   4 import Text.Pandoc
   5 import Data.Time (getCurrentTime)
   6 import System.IO (Handle)
   7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
   8 import Data.String.Utils (split, strip)
   9 import Data.Maybe (catMaybes, fromJust, isNothing)
  10 import Data.Tree.NTree.TypeDefs (NTree)
  11 import Test.HUnit (Assertion, assertEqual)
  12 import Test.Framework (Test, testGroup)
  13 import Test.Framework.Providers.HUnit (testCase)
  14 import Text.XML.HXT.Core (
  15   ArrowXml,
  16   IOSArrow,
  17   XmlTree,
  18   XNode,
  19   (>>>),
  20   (/>),
  21   (//>),
  22   getChildren,
  23   getText,
  24   hasAttrValue,
  25   hasName,
  26   mkName,
  27   none,
  28   processAttrl,
  29   processTopDown,
  30   runX,
  31   setElemName,
  32   xshow,
  33   when
  34   )
  35 import Text.HandsomeSoup (css, parseHtml)
  36
  37 import LWN.Article
  38 import XHTML
  39
  40 data Page =
  41   -- | An LWN page with one article on it.
  42   ArticlePage { article :: Article } |
  43
  44   -- | An LWN page with more than one article on it. These require
  45   --   different parsing and display functions than the single-article
  46   --   pages.
  47   FullPage { headline :: String,
  48              articles :: [Article] }
  49
  50
  51 instance XHTML Page where
  52   to_xhtml (ArticlePage a) =
  53     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  54     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  55     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  56     "<head>" ++
  57     "  <meta http-equiv=\"Content-Type\"" ++
  58     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  59     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  60     "</head>" ++
  61     "<body>" ++
  62     "<div>" ++
  63     (to_xhtml a) ++
  64     "</div>" ++
  65     "</body>" ++
  66     "</html>"
  67
  68   to_xhtml (FullPage hl as) =
  69     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  70     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  71     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  72     "<head>" ++
  73     "  <meta http-equiv=\"Content-Type\"" ++
  74     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  75     "  <title>" ++ hl ++ "</title>" ++
  76     "</head>" ++
  77     "<body>" ++
  78     "<div>" ++
  79     "<h1>" ++ hl ++ "</h1>" ++
  80     (concatMap to_xhtml as) ++
  81     "</div>" ++
  82     "</body>" ++
  83     "</html>"
  84
  85
  86
  87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
  88 remove_images =
  89   processTopDown ((none) `when` is_image)
  90
  91
  92 is_link :: (ArrowXml a) => a XmlTree XmlTree
  93 is_link =
  94   hasName "a"
  95
  96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
  97 replace_links_with_spans =
  98   processTopDown $ (make_span >>> remove_attrs) `when` is_link
  99   where
 100     make_span    = setElemName $ mkName "span"
 101     remove_attrs = processAttrl none
 102
 103 -- | Preprocessing common to both page types.
 104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 105 preprocess xml =
 106   xml >>> remove_images >>> replace_links_with_spans
 107
 108
 109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 110 parse xml = do
 111   let clean_xml = preprocess xml
 112   appr <- ap_parse clean_xml
 113   fppr <- fp_parse clean_xml
 114   return $
 115     if (isNothing appr) then
 116       fppr
 117     else
 118       appr
 119
 120
 121
 122 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 123 parse_headline xml = do
 124   let element_filter = xml >>> css "div.PageHeadline h1"
 125   let element_text_filter = element_filter /> getText
 126   element_text <- runX element_text_filter
 127   return $
 128     case element_text of
 129       [x] -> Just $ strip x
 130       []  -> Nothing
 131       _   -> error "Found more than one headline."
 132
 133
 134 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 135 parse_byline xml = do
 136   let element_filter = xml >>> css "div.FeatureByLine"
 137   let element_text_filter = element_filter /> getText
 138   element_text <- runX element_text_filter
 139   return $
 140     case element_text of
 141       [x] -> Just $ strip x
 142       []  -> Nothing
 143       _   -> error "Found more than one article byline."
 144
 145
 146 --
 147 -- ArticlePage Stuff
 148 --
 149 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 150 ap_parse xml = do
 151     arts <- ap_parse_articles xml
 152     case arts of
 153       Just [x] -> return $ Just $ ArticlePage x
 154       _   -> return Nothing
 155
 156
 157 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 158 ap_parse_body xml = do
 159   let element_filter = xml >>> css "div.ArticleText"
 160   let element_html_filter = xshow element_filter
 161   element_html <- runX element_html_filter
 162   return $ case element_html of
 163             [x] -> Just x
 164             []  -> Nothing
 165             _   -> error "Found more than one article."
 166
 167
 168 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
 169 ap_parse_articles xml = do
 170   parsed_headline <- parse_headline xml
 171   parsed_byline   <- parse_byline xml
 172   parsed_body     <- ap_parse_body xml
 173   let title'   = Title (fromJust parsed_headline)
 174   let byline' = Byline  parsed_byline
 175   let body'   = BodyHtml (fromJust parsed_body)
 176   return $ Just $ [Article title' byline' body']
 177
 178
 179
 180
 181 --
 182 -- FullPage Stuff
 183 --
 184
 185 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 186 fp_parse xml = do
 187     hl <- parse_headline xml
 188     parsed_articles <- fp_parse_articles xml
 189     case parsed_articles of
 190       them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
 191       _          -> return Nothing
 192
 193
 194
 195 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 196 fp_parse_article_title xml = do
 197   let element_filter = xml >>> css "h2.SummaryHL"
 198   let element_text_filter = element_filter //> getText
 199   element_text <- runX element_text_filter
 200   return $ case element_text of
 201             [x] -> Just $ strip x
 202             []  -> Nothing
 203             _   -> error "Found more than one article title."
 204
 205
 206
 207 is_title :: (ArrowXml a) => a XmlTree XmlTree
 208 is_title =
 209   (hasName "h2")
 210   >>>
 211   (hasAttrValue "class" (== "SummaryHL"))
 212
 213
 214 is_byline :: (ArrowXml a) => a XmlTree XmlTree
 215 is_byline =
 216   (hasName "div")
 217   >>>
 218   (hasAttrValue "class" (== "FeatureByLine"))
 219
 220
 221 is_image :: (ArrowXml a) => a XmlTree XmlTree
 222 is_image =
 223   hasName "img"
 224
 225 remove_title :: (ArrowXml a) => a XmlTree XmlTree
 226 remove_title =
 227   processTopDown ((none) `when` is_title)
 228
 229
 230 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
 231 remove_byline =
 232   processTopDown ((none) `when` is_byline)
 233
 234
 235
 236 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 237 fp_parse_article_body xml = do
 238   -- First, delete the article title and byline.
 239   let clean_xml' = xml >>> remove_title >>> remove_byline
 240   -- The only child of the body element should be a div.lwn-article
 241   -- since we wrapped the article's HTML in that.
 242   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 243   clean_html <- runX . xshow $ clean_xml
 244   return $ case clean_html of
 245             [x] -> Just x
 246             []  -> Nothing
 247             _   -> error "Found more than one article body."
 248
 249 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
 250 fp_parse_article xml = do
 251   parsed_article_title    <- fp_parse_article_title xml
 252   parsed_article_byline   <- parse_byline xml
 253   parsed_article_body     <- fp_parse_article_body xml
 254   let title'   = Title    $ fromJust parsed_article_title
 255   let byline'  = Byline     parsed_article_byline
 256   let body'    = BodyHtml $ fromJust parsed_article_body
 257   return $ Just $ Article title' byline' body'
 258
 259 parse_html_article :: String -> IO (Maybe Article)
 260 parse_html_article html = do
 261   let xml = parseHtml $ wrap_in_body_div html
 262   fp_parse_article xml
 263
 264
 265 -- | In the full page, all of the article titles and bodies are
 266 --   wrapped in one big div.ArticleText.
 267 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 268 parse_bodies xml =
 269   xml >>> css "div.ArticleText"
 270
 271
 272 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
 273 fp_parse_articles xml = do
 274   bodies <- runX . xshow $ parse_bodies xml
 275   let article_separator = "<h2 class=\"SummaryHL\">"
 276   let split_articles'' = split article_separator (concat bodies)
 277   -- The first element will contain the crap before the first <h2...>.
 278   let split_articles' = tail split_articles''
 279   -- Put the separator back, it was lost during the split.
 280   let split_articles = map (article_separator ++) split_articles'
 281   --_ <- mapM print_article split_articles
 282   real_articles <- mapM parse_html_article split_articles
 283   let just_articles = catMaybes real_articles
 284   return just_articles
 285
 286
 287 -- | This makes it easy to select otherwise-random chunks of html
 288 --   using 'css'.
 289 wrap_in_body_div :: String -> String
 290 wrap_in_body_div s =
 291   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 292
 293
 294
 295
 296 --
 297 -- Epublishable stuff
 298 --
 299
 300 title :: Page -> String
 301 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 302 title (FullPage hl _) = hl
 303
 304
 305 metadata :: Page -> IO String
 306 metadata obj = do
 307   date <- getCurrentTime
 308   return $
 309     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 310     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 311     "<dc:language>en-US</dc:language>\n" ++
 312     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 313     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 314
 315
 316 epublish :: Page -> Handle -> IO ()
 317 epublish obj handle = do
 318   let xhtml = to_xhtml obj
 319   epmd <- metadata obj
 320   epub <- xhtml_to_epub epmd xhtml
 321   B.hPut handle epub
 322
 323
 324 xhtml_to_epub :: String -> String -> IO B.ByteString
 325 xhtml_to_epub epmd =
 326    write_epub . read_html
 327    where
 328      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 329      write_epub = writeEPUB Nothing [] my_writer_options
 330      read_html  = readHtml defaultParserState
 331
 332
 333
 334
 335
 336 --
 337 -- Tests
 338 --
 339
 340 test_preprocess_links :: Assertion
 341 test_preprocess_links = do
 342   actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
 343   let actual_xml = actual_xml' !! 0
 344
 345   expected_xml' <- runX $ expected_xml'' >>> css "body"
 346   let expected_xml = expected_xml' !! 0
 347
 348   assertEqual
 349     "Links are replaced with spans"
 350     expected_xml
 351     actual_xml
 352   where
 353     input_html = "<body><a href=\"#\">Hello, world!</a></body>"
 354     input_xml  = parseHtml input_html
 355     expected_html = "<body><span>Hello, world!</span></body>"
 356     expected_xml'' = parseHtml expected_html
 357
 358 page_tests :: Test
 359 page_tests =
 360   testGroup "Page Tests" [
 361     testCase "Links are replaced with spans" test_preprocess_links ]