src/LWN/FullPage.hs

   1 module LWN.FullPage
   2 where
   3
   4 import Data.String.Utils (split, strip)
   5 import Data.Maybe (catMaybes, fromJust)
   6 import Data.Tree.NTree.TypeDefs (NTree)
   7 import Text.XML.HXT.Core (
   8   IOSArrow,
   9   XmlTree,
  10   XNode,
  11   (>>>),
  12   (/>),
  13   getText,
  14   hasAttrValue,
  15   hasName,
  16   none,
  17   processTopDown,
  18   runX,
  19   xshow,
  20   when
  21   )
  22 import Text.XML.HXT.Arrow.Edit (indentDoc)
  23 import Text.HandsomeSoup (css, parseHtml)
  24
  25 import Epublishable
  26 import LWN.Article
  27 import XHTML
  28
  29 -- | An LWN page with more than one article on it. These require
  30 --   different parsing and display functions than the single-article
  31 --   pages.
  32 data FullPage = FullPage { headline :: String,
  33                            articles :: [Article] }
  34
  35 articles_xhtml :: FullPage -> String
  36 articles_xhtml fp = concatMap to_xhtml (articles fp)
  37
  38 instance XHTML FullPage where
  39   to_xhtml fp =
  40     "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  41     "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
  42     "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  43     "\n<head>\n" ++
  44     "  <meta http-equiv=\"Content-Type\"" ++
  45     " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  46     "  <title>" ++ (headline fp) ++ "</title>\n" ++
  47     "</head>\n" ++
  48     "<body>\n" ++
  49     "<div>\n\n" ++
  50     "<h1>" ++ (headline fp) ++ "</h1>" ++
  51     (articles_xhtml fp) ++
  52     "\n\n</div>\n" ++
  53     "\n</body>\n" ++
  54     "</html>"
  55
  56 instance Epublishable FullPage where
  57   parse xml = do
  58     hl <- parse_headline xml
  59     parsed_articles <- parse_articles xml
  60     case parsed_articles of
  61       them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
  62       _          -> return Nothing
  63
  64   title _ = "LWN.net"
  65
  66
  67 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  68 parse_headline xml = do
  69   let element_filter = xml >>> css "div.PageHeadline h1"
  70   let element_text_filter = element_filter /> getText
  71   element_text <- runX element_text_filter
  72   return $ case element_text of
  73             [x] -> Just $ strip x
  74             []  -> Nothing
  75             _   -> error "Found more than one headline."
  76
  77 parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  78 parse_article_byline xml = do
  79   let element_filter = xml >>> css "div.FeatureByLine"
  80   let element_text_filter = element_filter /> getText
  81   element_text <- runX element_text_filter
  82   return $ case element_text of
  83             [x] -> Just $ strip x
  84             []  -> Nothing
  85             _   -> error "Found more than one article byline."
  86
  87
  88 parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  89 parse_article_title xml = do
  90   let element_filter = xml >>> css "h2.SummaryHL a"
  91   let element_text_filter = element_filter /> getText
  92   element_text <- runX element_text_filter
  93   return $ case element_text of
  94             [x] -> Just $ strip x
  95             []  -> Nothing
  96             _   -> error "Found more than one article title."
  97
  98
  99 --is_title :: Integer
 100 --is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL"))
 101
 102 --is_byline :: Integer
 103 --is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine"))
 104
 105 parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
 106 parse_article_body xml = do
 107   -- First, delete the article title and byline.
 108   --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline)
 109   let clean_xml = xml
 110   clean_html <- runX $ xshow clean_xml
 111   return $ case clean_html of
 112             [x] -> Just x
 113             []  -> Nothing
 114             _   -> error "Found more than one article body."
 115
 116 parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
 117 parse_article xml = do
 118   parsed_article_title    <- parse_article_title xml
 119   parsed_article_byline   <- parse_article_byline xml
 120   parsed_article_body     <- parse_article_body xml
 121 --  let title'   = fromJust parsed_article_title
 122   let title' = "title"
 123 --  let byline' = fromJust parsed_article_byline
 124   let byline' = "byline"
 125 --  let body'   = fromJust parsed_article_body
 126   body' <- runX . xshow $ xml
 127   return $ Just $ Article title' byline' (body' !! 0)
 128
 129 parse_html_article :: String -> IO (Maybe Article)
 130 parse_html_article html = do
 131   let xml = parseHtml html
 132   parse_article xml
 133
 134
 135 -- | In the full page, all of the article titles and bodies are
 136 --   wrapped in a div.ArticleText.
 137 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
 138 parse_bodies xml =
 139   xml >>> css "div.ArticleText"
 140
 141
 142
 143 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
 144 parse_articles xml = do
 145   bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc
 146   let article_separator = "<h2 class=\"SummaryHL\">"
 147   let split_articles'' = split article_separator (bodies !! 0)
 148   -- The first element will contain the crap before the first <h2...>.
 149   let split_articles' = tail split_articles''
 150   -- Put the separator back, it was lost during the split.
 151   let split_articles = map (("\n" ++ article_separator) ++) split_articles'
 152   putStrLn "split articles\n\n"
 153   mapM putStrLn split_articles
 154   real_articles <- mapM parse_html_article split_articles
 155   let just_articles = catMaybes real_articles
 156   return just_articles
 157
 158