src/LWN/FullPage.hs

   1 module LWN.FullPage
   2 where
   3
   4 import Data.String.Utils (split, strip)
   5 import Data.Maybe (catMaybes, fromJust)
   6 import Data.Tree.NTree.TypeDefs (NTree)
   7 import Text.XML.HXT.Core (
   8   ArrowXml,
   9   IOSArrow,
  10   XmlTree,
  11   XNode,
  12   (>>>),
  13   (/>),
  14   (//>),
  15   getChildren,
  16   getText,
  17   hasAttrValue,
  18   hasName,
  19   none,
  20   processTopDown,
  21   runX,
  22   xshow,
  23   when
  24   )
  25 import Text.HandsomeSoup (css, parseHtml)
  26
  27 import Epublishable
  28 import LWN.Article
  29 import XHTML
  30
  31 -- | An LWN page with more than one article on it. These require
  32 --   different parsing and display functions than the single-article
  33 --   pages.
  34 data FullPage = FullPage { headline :: String,
  35                            articles :: [Article] }
  36
  37 articles_xhtml :: FullPage -> String
  38 articles_xhtml fp = concatMap to_xhtml (articles fp)
  39
  40 instance XHTML FullPage where
  41   to_xhtml fp =
  42     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  43     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  44     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  45     "<head>" ++
  46     "  <meta http-equiv=\"Content-Type\"" ++
  47     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  48     "  <title>" ++ (headline fp) ++ "</title>" ++
  49     "</head>" ++
  50     "<body>" ++
  51     "<div>" ++
  52     "<h1>" ++ (headline fp) ++ "</h1>" ++
  53     (articles_xhtml fp) ++
  54     "</div>" ++
  55     "</body>" ++
  56     "</html>"
  57
  58 instance Epublishable FullPage where
  59   parse xml = do
  60     hl <- parse_headline xml
  61     parsed_articles <- parse_articles xml
  62     case parsed_articles of
  63       them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
  64       _          -> return Nothing
  65
  66   title = headline
  67
  68
  69 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  70 parse_headline xml = do
  71   let element_filter = xml >>> css "div.PageHeadline h1"
  72   let element_text_filter = element_filter /> getText
  73   element_text <- runX element_text_filter
  74   return $ case element_text of
  75             [x] -> Just $ strip x
  76             []  -> Nothing
  77             _   -> error "Found more than one headline."
  78
  79 parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  80 parse_article_byline xml = do
  81   let element_filter = xml >>> css "div.FeatureByLine"
  82   let element_text_filter = element_filter /> getText
  83   element_text <- runX element_text_filter
  84   return $ case element_text of
  85             [x] -> Just $ strip x
  86             []  -> Nothing
  87             _   -> error "Found more than one article byline."
  88
  89
  90 parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  91 parse_article_title xml = do
  92   let element_filter = xml >>> css "h2.SummaryHL"
  93   let element_text_filter = element_filter //> getText
  94   element_text <- runX element_text_filter
  95   return $ case element_text of
  96             [x] -> Just $ strip x
  97             []  -> Nothing
  98             _   -> error "Found more than one article title."
  99
 100
 101
 102 is_title :: (ArrowXml a) => a XmlTree XmlTree
 103 is_title =
 104   (hasName "h2")
 105   >>>
 106   (hasAttrValue "class" (== "SummaryHL"))
 107
 108
 109 is_byline :: (ArrowXml a) => a XmlTree XmlTree
 110 is_byline =
 111   (hasName "div")
 112   >>>
 113   (hasAttrValue "class" (== "FeatureByLine"))
 114
 115
 116 is_image :: (ArrowXml a) => a XmlTree XmlTree
 117 is_image =
 118   hasName "img"
 119
 120
 121 remove_title :: (ArrowXml a) => a XmlTree XmlTree
 122 remove_title =
 123   processTopDown ((none) `when` is_title)
 124
 125
 126 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
 127 remove_byline =
 128   processTopDown ((none) `when` is_byline)
 129
 130
 131 remove_images :: (ArrowXml a) => a XmlTree XmlTree
 132 remove_images =
 133   processTopDown ((none) `when` is_image)
 134
 135
 136
 137 parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 138 parse_article_body xml = do
 139   -- First, delete the article title and byline.
 140   let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
 141   -- The only child of the body element should be a div.lwn-article
 142   -- since we wrapped the article's HTML in that.
 143   let clean_xml = clean_xml' >>> css "body" >>> getChildren
 144   clean_html <- runX . xshow $ clean_xml
 145   return $ case clean_html of
 146             [x] -> Just x
 147             []  -> Nothing
 148             _   -> error "Found more than one article body."
 149
 150 parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
 151 parse_article xml = do
 152   parsed_article_title    <- parse_article_title xml
 153   parsed_article_byline   <- parse_article_byline xml
 154   parsed_article_body     <- parse_article_body xml
 155   let title'   = Title    $ fromJust parsed_article_title
 156   let byline'  = Byline     parsed_article_byline
 157   let body'    = BodyHtml $ fromJust parsed_article_body
 158   return $ Just $ Article title' byline' body'
 159
 160 parse_html_article :: String -> IO (Maybe Article)
 161 parse_html_article html = do
 162   let xml = parseHtml $ wrap_in_body_div html
 163   parse_article xml
 164
 165
 166 -- | In the full page, all of the article titles and bodies are
 167 --   wrapped in a div.ArticleText.
 168 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 169 parse_bodies xml =
 170   xml >>> css "div.ArticleText"
 171
 172
 173 -- Debug, print a string.
 174 print_article :: String -> IO ()
 175 print_article s = do
 176   putStrLn "-----------"
 177   putStrLn "- Article -"
 178   putStrLn "-----------"
 179   putStrLn ""
 180   putStrLn s
 181   putStrLn ""
 182
 183
 184 -- Debug, print an article's body html.
 185 print_body :: Article -> IO ()
 186 print_body x =
 187   print_article bh
 188   where
 189     bh' = body_html x
 190     bh  = getBodyHtml bh'
 191
 192
 193 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
 194 parse_articles xml = do
 195   bodies <- runX . xshow $ parse_bodies xml
 196   let article_separator = "<h2 class=\"SummaryHL\">"
 197   let split_articles'' = split article_separator (concat bodies)
 198   -- The first element will contain the crap before the first <h2...>.
 199   let split_articles' = tail split_articles''
 200   -- Put the separator back, it was lost during the split.
 201   let split_articles = map (article_separator ++) split_articles'
 202   --_ <- mapM print_article split_articles
 203   real_articles <- mapM parse_html_article split_articles
 204   let just_articles = catMaybes real_articles
 205   return just_articles
 206
 207
 208 wrap_in_body_div :: String -> String
 209 wrap_in_body_div s =
 210   "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"