Use cmdargs to parse the one command-line argument.

[dead/lwn-epub.git] / src / LWN / FullPage.hs
diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs

index 9ff782c89bc0b7e60cda43a7d47379b97470083b..1ba7910cf329d414a6b8e6d6ab10dbceee239050 100644 (file)
--- a/src/LWN/FullPage.hs
+++ b/src/LWN/FullPage.hs
@@ -1,6 +1,29 @@
  module LWN.FullPage
  where
  
  module LWN.FullPage
  where
  
+import Data.String.Utils (split, strip)
+import Data.Maybe (catMaybes, fromJust)
+import Data.Tree.NTree.TypeDefs (NTree)
+import Text.XML.HXT.Core (
+  ArrowXml,
+  IOSArrow,
+  XmlTree,
+  XNode,
+  (>>>),
+  (/>),
+  (//>),
+  getChildren,
+  getText,
+  hasAttrValue,
+  hasName,
+  none,
+  processTopDown,
+  runX,
+  xshow,
+  when
+  )
+import Text.HandsomeSoup (css, parseHtml)
+
  import Epublishable
  import LWN.Article
  import XHTML
  import Epublishable
  import LWN.Article
  import XHTML
@@ -8,16 +31,180 @@ import XHTML
  -- | An LWN page with more than one article on it. These require
  --   different parsing and display functions than the single-article
  --   pages.
  -- | An LWN page with more than one article on it. These require
  --   different parsing and display functions than the single-article
  --   pages.
-data FullPage = FullPage { articles :: [Article] }
+data FullPage = FullPage { headline :: String,
+                           articles :: [Article] }
+
+articles_xhtml :: FullPage -> String
+articles_xhtml fp = concatMap to_xhtml (articles fp)
  
  instance XHTML FullPage where
  
  instance XHTML FullPage where
-  to_xhtml fp = ""
+  to_xhtml fp =
+    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
+    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
+    "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
+    "<head>" ++
+    "  <meta http-equiv=\"Content-Type\"" ++
+    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
+    "  <title>" ++ (headline fp) ++ "</title>" ++
+    "</head>" ++
+    "<body>" ++
+    "<div>" ++
+    "<h1>" ++ (headline fp) ++ "</h1>" ++
+    (articles_xhtml fp) ++
+    "</div>" ++
+    "</body>" ++
+    "</html>"
  
  instance Epublishable FullPage where
    parse xml = do
  
  instance Epublishable FullPage where
    parse xml = do
-    articles <- parse_articles xml
-    case articles of
-      (x:xs)@all -> return $ Just $ FullPage all
+    hl <- parse_headline xml
+    parsed_articles <- parse_articles xml
+    case parsed_articles of
+      them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
        _          -> return Nothing
  
        _          -> return Nothing
  
-  title _ = "LWN.net"
+  title = headline
+
+
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
+  let element_filter = xml >>> css "div.PageHeadline h1"
+  let element_text_filter = element_filter /> getText
+  element_text <- runX element_text_filter
+  return $ case element_text of
+            [x] -> Just $ strip x
+            []  -> Nothing
+            _   -> error "Found more than one headline."
+
+parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_byline xml = do
+  let element_filter = xml >>> css "div.FeatureByLine"
+  let element_text_filter = element_filter /> getText
+  element_text <- runX element_text_filter
+  return $ case element_text of
+            [x] -> Just $ strip x
+            []  -> Nothing
+            _   -> error "Found more than one article byline."
+
+
+parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_title xml = do
+  let element_filter = xml >>> css "h2.SummaryHL"
+  let element_text_filter = element_filter //> getText
+  element_text <- runX element_text_filter
+  return $ case element_text of
+            [x] -> Just $ strip x
+            []  -> Nothing
+            _   -> error "Found more than one article title."
+
+
+
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+  (hasName "h2")
+  >>>
+  (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+  (hasName "div")
+  >>>
+  (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+  hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+  processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+  processTopDown ((none) `when` is_byline)
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+  processTopDown ((none) `when` is_image)
+
+
+
+parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_body xml = do
+  -- First, delete the article title and byline.
+  let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
+  -- The only child of the body element should be a div.lwn-article
+  -- since we wrapped the article's HTML in that.
+  let clean_xml = clean_xml' >>> css "body" >>> getChildren
+  clean_html <- runX . xshow $ clean_xml
+  return $ case clean_html of
+            [x] -> Just x
+            []  -> Nothing
+            _   -> error "Found more than one article body."
+
+parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+parse_article xml = do
+  parsed_article_title    <- parse_article_title xml
+  parsed_article_byline   <- parse_article_byline xml
+  parsed_article_body     <- parse_article_body xml
+  let title'   = Title    $ fromJust parsed_article_title
+  let byline'  = Byline     parsed_article_byline
+  let body'    = BodyHtml $ fromJust parsed_article_body
+  return $ Just $ Article title' byline' body'
+
+parse_html_article :: String -> IO (Maybe Article)
+parse_html_article html = do
+  let xml = parseHtml $ wrap_in_body_div html
+  parse_article xml
+
+  
+-- | In the full page, all of the article titles and bodies are
+--   wrapped in a div.ArticleText.
+parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+parse_bodies xml =
+  xml >>> css "div.ArticleText"
+
+
+-- Debug, print a string.
+print_article :: String -> IO ()
+print_article s = do
+  putStrLn "-----------"
+  putStrLn "- Article -"
+  putStrLn "-----------"
+  putStrLn ""
+  putStrLn s
+  putStrLn ""
+
+
+-- Debug, print an article's body html.
+print_body :: Article -> IO ()
+print_body x =
+  print_article bh
+  where
+    bh' = body_html x
+    bh  = getBodyHtml bh'
+
+
+parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+parse_articles xml = do
+  bodies <- runX . xshow $ parse_bodies xml
+  let article_separator = "<h2 class=\"SummaryHL\">"
+  let split_articles'' = split article_separator (concat bodies)
+  -- The first element will contain the crap before the first <h2...>.
+  let split_articles' = tail split_articles''
+  -- Put the separator back, it was lost during the split.
+  let split_articles = map (article_separator ++) split_articles'
+  --_ <- mapM print_article split_articles
+  real_articles <- mapM parse_html_article split_articles
+  let just_articles = catMaybes real_articles
+  return just_articles
+
+
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"