src/LWN/Page.hs

   1 module LWN.Page
   2 where
   3
   4 import Text.Pandoc
   5 import Data.Time (getCurrentTime)
   6 import System.IO (Handle)
   7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
   8 import Data.String.Utils (split, strip)
   9 import Data.Maybe (catMaybes, fromJust, isNothing)
  10 import Data.Tree.NTree.TypeDefs (NTree)
  11 import Text.XML.HXT.Core (
  12   ArrowXml,
  13   IOSArrow,
  14   XmlTree,
  15   XNode,
  16   (>>>),
  17   (/>),
  18   (//>),
  19   getChildren,
  20   getText,
  21   hasAttrValue,
  22   hasName,
  23   none,
  24   processTopDown,
  25   runX,
  26   xshow,
  27   when
  28   )
  29 import Text.HandsomeSoup (css, parseHtml)
  30
  31 import LWN.Article
  32 import XHTML
  33
  34 data Page =
  35   -- | An LWN page with one article on it.
  36   ArticlePage { article :: Article } |
  37
  38   -- | An LWN page with more than one article on it. These require
  39   --   different parsing and display functions than the single-article
  40   --   pages.
  41   FullPage { headline :: String,
  42              articles :: [Article] }
  43
  44
  45 instance XHTML Page where
  46   to_xhtml (ArticlePage a) =
  47     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  48     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  49     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  50     "<head>" ++
  51     "  <meta http-equiv=\"Content-Type\"" ++
  52     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  53     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
  54     "</head>" ++
  55     "<body>" ++
  56     "<div>" ++
  57     (to_xhtml a) ++
  58     "</div>" ++
  59     "</body>" ++
  60     "</html>"
  61
  62   to_xhtml (FullPage hl as) =
  63     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  64     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  65     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  66     "<head>" ++
  67     "  <meta http-equiv=\"Content-Type\"" ++
  68     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  69     "  <title>" ++ hl ++ "</title>" ++
  70     "</head>" ++
  71     "<body>" ++
  72     "<div>" ++
  73     "<h1>" ++ hl ++ "</h1>" ++
  74     (concatMap to_xhtml as) ++
  75     "</div>" ++
  76     "</body>" ++
  77     "</html>"
  78
  79
  80
  81 remove_images :: (ArrowXml a) => a XmlTree XmlTree
  82 remove_images =
  83   processTopDown ((none) `when` is_image)
  84
  85
  86 -- | Preprocessing common to both page types.
  87 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
  88 preprocess xml =
  89   xml >>>remove_images
  90
  91
  92 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
  93 parse xml = do
  94   let clean_xml = preprocess xml
  95   appr <- ap_parse clean_xml
  96   fppr <- fp_parse clean_xml
  97   return $
  98     if (isNothing appr) then
  99       fppr
 100     else
 101       appr
 102
 103 --
 104 -- ArticlePage Stuff
 105 --
 106 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 107 ap_parse xml = do
 108     arts <- ap_parse_articles xml
 109     case arts of
 110       Just [x] -> return $ Just $ ArticlePage x
 111       _   -> return Nothing
 112
 113
 114 ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 115 ap_parse_headline xml = do
 116   let element_filter = xml >>> css "div.PageHeadline h1"
 117   let element_text_filter = element_filter /> getText
 118   element_text <- runX element_text_filter
 119   return $ case element_text of
 120             [x] -> Just $ strip x
 121             []  -> Nothing
 122             _   -> error "Found more than one headline."
 123
 124 ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 125 ap_parse_byline xml = do
 126   let element_filter = xml >>> css "div.Byline"
 127   let element_text_filter = element_filter /> getText
 128   element_text <- runX element_text_filter
 129   return $ case element_text of
 130             [x] -> Just $ strip x
 131             []  -> Nothing
 132             _   -> error "Found more than one byline."
 133
 134
 135 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 136 ap_parse_body xml = do
 137   let element_filter = xml >>> css "div.ArticleText"
 138   let element_html_filter = xshow element_filter
 139   element_html <- runX element_html_filter
 140   return $ case element_html of
 141             [x] -> Just x
 142             []  -> Nothing
 143             _   -> error "Found more than one article."
 144
 145
 146 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
 147 ap_parse_articles xml = do
 148   parsed_headline <- ap_parse_headline xml
 149   parsed_byline   <- ap_parse_byline xml
 150   parsed_body     <- ap_parse_body xml
 151   let title'   = Title (fromJust parsed_headline)
 152   let byline' = Byline  parsed_byline
 153   let body'   = BodyHtml (fromJust parsed_body)
 154   return $ Just $ [Article title' byline' body']
 155
 156
 157
 158
 159 --
 160 -- FullPage Stuff
 161 --
 162
 163 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
 164 fp_parse xml = do
 165     hl <- fp_parse_headline xml
 166     parsed_articles <- fp_parse_articles xml
 167     case parsed_articles of
 168       them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
 169       _          -> return Nothing
 170
 171
 172
 173
 174 fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 175 fp_parse_headline xml = do
 176   let element_filter = xml >>> css "div.PageHeadline h1"
 177   let element_text_filter = element_filter /> getText
 178   element_text <- runX element_text_filter
 179   return $ case element_text of
 180             [x] -> Just $ strip x
 181             []  -> Nothing
 182             _   -> error "Found more than one headline."
 183
 184 fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 185 fp_parse_article_byline xml = do
 186   let element_filter = xml >>> css "div.FeatureByLine"
 187   let element_text_filter = element_filter /> getText
 188   element_text <- runX element_text_filter
 189   return $ case element_text of
 190             [x] -> Just $ strip x
 191             []  -> Nothing
 192             _   -> error "Found more than one article byline."
 193
 194
 195 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 196 fp_parse_article_title xml = do
 197   let element_filter = xml >>> css "h2.SummaryHL"
 198   let element_text_filter = element_filter //> getText
 199   element_text <- runX element_text_filter
 200   return $ case element_text of
 201             [x] -> Just $ strip x
 202             []  -> Nothing
 203             _   -> error "Found more than one article title."
 204
 205
 206
 207 is_title :: (ArrowXml a) => a XmlTree XmlTree
 208 is_title =
 209   (hasName "h2")
 210   >>>
 211   (hasAttrValue "class" (== "SummaryHL"))
 212
 213
 214 is_byline :: (ArrowXml a) => a XmlTree XmlTree
 215 is_byline =
 216   (hasName "div")
 217   >>>
 218   (hasAttrValue "class" (== "FeatureByLine"))
 219
 220
 221 is_image :: (ArrowXml a) => a XmlTree XmlTree
 222 is_image =
 223   hasName "img"
 224
 225
 226 remove_title :: (ArrowXml a) => a XmlTree XmlTree
 227 remove_title =
 228   processTopDown ((none) `when` is_title)
 229
 230
 231 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
 232 remove_byline =
 233   processTopDown ((none) `when` is_byline)
 234
 235
 236
 237 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 238 fp_parse_article_body xml = do
 239   -- First, delete the article title and byline.
 240   let clean_xml' = xml >>> remove_title >>> remove_byline
 241   -- The only child of the body element should be a div.lwn-article
 242   -- since we wrapped the article's HTML in that.
 243   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 244   clean_html <- runX . xshow $ clean_xml
 245   return $ case clean_html of
 246             [x] -> Just x
 247             []  -> Nothing
 248             _   -> error "Found more than one article body."
 249
 250 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
 251 fp_parse_article xml = do
 252   parsed_article_title    <- fp_parse_article_title xml
 253   parsed_article_byline   <- fp_parse_article_byline xml
 254   parsed_article_body     <- fp_parse_article_body xml
 255   let title'   = Title    $ fromJust parsed_article_title
 256   let byline'  = Byline     parsed_article_byline
 257   let body'    = BodyHtml $ fromJust parsed_article_body
 258   return $ Just $ Article title' byline' body'
 259
 260 parse_html_article :: String -> IO (Maybe Article)
 261 parse_html_article html = do
 262   let xml = parseHtml $ wrap_in_body_div html
 263   fp_parse_article xml
 264
 265
 266 -- | In the full page, all of the article titles and bodies are
 267 --   wrapped in a div.ArticleText.
 268 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 269 parse_bodies xml =
 270   xml >>> css "div.ArticleText"
 271
 272
 273 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
 274 fp_parse_articles xml = do
 275   bodies <- runX . xshow $ parse_bodies xml
 276   let article_separator = "<h2 class=\"SummaryHL\">"
 277   let split_articles'' = split article_separator (concat bodies)
 278   -- The first element will contain the crap before the first <h2...>.
 279   let split_articles' = tail split_articles''
 280   -- Put the separator back, it was lost during the split.
 281   let split_articles = map (article_separator ++) split_articles'
 282   --_ <- mapM print_article split_articles
 283   real_articles <- mapM parse_html_article split_articles
 284   let just_articles = catMaybes real_articles
 285   return just_articles
 286
 287
 288 wrap_in_body_div :: String -> String
 289 wrap_in_body_div s =
 290   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
 291
 292
 293
 294
 295 --
 296 -- Epublishable stuff
 297 --
 298
 299 title :: Page -> String
 300 title (ArticlePage a)  = getTitle $ LWN.Article.title a
 301 title (FullPage hl _) = hl
 302
 303
 304 metadata :: Page -> IO String
 305 metadata obj = do
 306   date <- getCurrentTime
 307   return $
 308     "<dc:creator>http://lwn.net/</dc:creator>\n" ++
 309     "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
 310     "<dc:language>en-US</dc:language>\n" ++
 311     "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
 312     "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
 313
 314
 315 epublish :: Page -> Handle -> IO ()
 316 epublish obj handle = do
 317   let xhtml = to_xhtml obj
 318   epmd <- metadata obj
 319   epub <- xhtml_to_epub epmd xhtml
 320   B.hPut handle epub
 321
 322
 323 xhtml_to_epub :: String -> String -> IO B.ByteString
 324 xhtml_to_epub epmd =
 325    write_epub . read_html
 326    where
 327      my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
 328      write_epub = writeEPUB Nothing [] my_writer_options
 329      read_html  = readHtml defaultParserState