src/LWN/ArticlePage.hs

   1 module LWN.ArticlePage
   2 where
   3
   4 import Data.String.Utils (strip)
   5 import Data.Maybe (fromJust)
   6 import Data.Tree.NTree.TypeDefs (NTree)
   7 import Text.XML.HXT.Core (
   8   IOSArrow,
   9   XmlTree,
  10   XNode,
  11   (>>>),
  12   (/>),
  13   getText,
  14   runX,
  15   xshow
  16   )
  17 import Text.HandsomeSoup (css)
  18
  19 import Epublishable
  20 import LWN.Article
  21 import XHTML
  22
  23 -- | Defines the ArticlePage data type, containing one 'Article'.
  24 data ArticlePage = ArticlePage { article :: Article }
  25
  26
  27 instance XHTML ArticlePage where
  28   to_xhtml (ArticlePage a) =
  29     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  30     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  31     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  32     "\n<head>\n" ++
  33     "  <meta http-equiv=\"Content-Type\"" ++
  34     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  35     "  <title>" ++ (LWN.Article.title a) ++ "</title>\n" ++
  36     "</head>\n" ++
  37     "<body>\n" ++
  38     "<div>\n\n" ++
  39     (to_xhtml a) ++
  40     "\n\n</div>\n" ++
  41     "\n</body>\n" ++
  42     "</html>"
  43
  44
  45 instance Epublishable ArticlePage where
  46   parse xml = do
  47     articles <- parse_articles xml
  48     case articles of
  49       Just [x] -> return $ Just $ ArticlePage x
  50       _   -> return Nothing
  51
  52   title (ArticlePage x) = LWN.Article.title x
  53
  54
  55 -- | Takes data from an LWN page and determines whether or not it's a
  56 --   single article (as opposed to a page with multiple articles).
  57 is_article_page :: String -> IO Bool
  58 is_article_page _ = return True
  59
  60
  61 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  62 parse_headline xml = do
  63   let element_filter = xml >>> css "div.PageHeadline h1"
  64   let element_text_filter = element_filter /> getText
  65   element_text <- runX element_text_filter
  66   return $ case element_text of
  67             [x] -> Just $ strip x
  68             []  -> Nothing
  69             _   -> error "Found more than one headline."
  70
  71 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  72 parse_byline xml = do
  73   let element_filter = xml >>> css "div.Byline"
  74   let element_text_filter = element_filter /> getText
  75   element_text <- runX element_text_filter
  76   return $ case element_text of
  77             [x] -> Just $ strip x
  78             []  -> Nothing
  79             _   -> error "Found more than one byline."
  80
  81
  82 parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  83 parse_body xml = do
  84   let element_filter = xml >>> css "div.ArticleText"
  85   let element_html_filter = xshow element_filter
  86   element_html <- runX element_html_filter
  87   return $ case element_html of
  88             [x] -> Just x
  89             []  -> Nothing
  90             _   -> error "Found more than one article."
  91
  92
  93 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
  94 parse_articles xml = do
  95   parsed_headline <- parse_headline xml
  96   parsed_byline   <- parse_byline xml
  97   parsed_body     <- parse_body xml
  98   let title'   = fromJust parsed_headline
  99   let byline' = fromJust parsed_byline
 100   let body'   = fromJust parsed_body
 101   return $ Just $ [Article title' byline' body']