Go through a good bit of nonsense to get it successfully parsing our three test cases.

author Michael Orlitzky <michael@orlitzky.com>

Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)

committer Michael Orlitzky <michael@orlitzky.com>

Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)
author Michael Orlitzky <michael@orlitzky.com>
Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)
committer Michael Orlitzky <michael@orlitzky.com>
Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)
diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs

index 2820d54f93393a94fbd59b6928405c34f9598af1..70c68a5190d724fc2663531c57a63d02efb7c227 100644 (file)
--- a/src/LWN/Article.hs
+++ b/src/LWN/Article.hs
@@ -3,12 +3,36 @@ where
  
  import XHTML
  
  
  import XHTML
  
-data Article = Article { title     :: String,
-                         byline    :: String,
-                         body_html :: String }
+newtype Title    = Title    { getTitle    :: String }
+newtype Byline   = Byline   { getByline   :: Maybe String }
+newtype BodyHtml = BodyHtml { getBodyHtml :: String }
+
+instance Show Title where
+  show = getTitle
+
+instance Show Byline where
+  show (Byline (Just bl)) = bl
+  show (Byline Nothing  ) = ""
+
+instance Show BodyHtml where
+  show = getBodyHtml
+
+instance XHTML Title where
+  to_xhtml (Title t) = "<h2>" ++ t ++ "</h2>"
+
+instance XHTML Byline where
+  to_xhtml (Byline (Just bl)) = "<p><em>" ++ bl ++ "</em></p>"
+  to_xhtml (Byline Nothing)   = ""
+
+instance XHTML BodyHtml where
+  to_xhtml = getBodyHtml
+
+data Article = Article { title     :: Title,
+                         byline    :: Byline,
+                         body_html :: BodyHtml }
  
  instance XHTML Article where
    to_xhtml (Article t bl b) =
  
  instance XHTML Article where
    to_xhtml (Article t bl b) =
-    "<h2>" ++ t ++ "</h2>\n\n" ++
-    "<p><em>" ++ bl ++ "</em></p>\n\n" ++
-    b
+    (to_xhtml t)  ++
+    (to_xhtml bl) ++
+    (to_xhtml b)
diff --git a/src/LWN/ArticlePage.hs b/src/LWN/ArticlePage.hs

index 927e241d91eba76b57f586133087db652738df4f..5964013f418303db52ce1ebe7dfdea123cdfe70c 100644 (file)
--- a/src/LWN/ArticlePage.hs
+++ b/src/LWN/ArticlePage.hs
@@ -29,16 +29,16 @@ instance XHTML ArticlePage where
      "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
      "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
      "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
      "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
-    "\n<head>\n" ++
+    "<head>" ++
      "  <meta http-equiv=\"Content-Type\"" ++
      " content=\"application/xhtml+xml; charset=utf-8\" />" ++
      "  <meta http-equiv=\"Content-Type\"" ++
      " content=\"application/xhtml+xml; charset=utf-8\" />" ++
-    "  <title>" ++ (LWN.Article.title a) ++ "</title>\n" ++
-    "</head>\n" ++
-    "<body>\n" ++
-    "<div>\n\n" ++
+    "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
+    "</head>" ++
+    "<body>" ++
+    "<div>" ++
      (to_xhtml a) ++
      (to_xhtml a) ++
-    "\n\n</div>\n" ++
-    "\n</body>\n" ++
+    "</div>" ++
+    "</body>" ++
      "</html>"
  
  
      "</html>"
  
  
@@ -49,7 +49,7 @@ instance Epublishable ArticlePage where
        Just [x] -> return $ Just $ ArticlePage x
        _   -> return Nothing
  
        Just [x] -> return $ Just $ ArticlePage x
        _   -> return Nothing
  
-  title (ArticlePage x) = LWN.Article.title x
+  title (ArticlePage x) = show $ LWN.Article.title x
  
  
  -- | Takes data from an LWN page and determines whether or not it's a
  
  
  -- | Takes data from an LWN page and determines whether or not it's a
@@ -95,7 +95,7 @@ parse_articles xml = do
    parsed_headline <- parse_headline xml
    parsed_byline   <- parse_byline xml
    parsed_body     <- parse_body xml
    parsed_headline <- parse_headline xml
    parsed_byline   <- parse_byline xml
    parsed_body     <- parse_body xml
-  let title'   = fromJust parsed_headline
-  let byline' = fromJust parsed_byline
-  let body'   = fromJust parsed_body
+  let title'   = Title (fromJust parsed_headline)
+  let byline' = Byline  parsed_byline
+  let body'   = BodyHtml (fromJust parsed_body)
    return $ Just $ [Article title' byline' body']
    return $ Just $ [Article title' byline' body']
diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs

index ac8886279547e926de7e91fd12a84a33af9a3bc0..1ba7910cf329d414a6b8e6d6ab10dbceee239050 100644 (file)
--- a/src/LWN/FullPage.hs
+++ b/src/LWN/FullPage.hs
@@ -5,11 +5,14 @@ import Data.String.Utils (split, strip)
  import Data.Maybe (catMaybes, fromJust)
  import Data.Tree.NTree.TypeDefs (NTree)
  import Text.XML.HXT.Core (
  import Data.Maybe (catMaybes, fromJust)
  import Data.Tree.NTree.TypeDefs (NTree)
  import Text.XML.HXT.Core (
+  ArrowXml,
    IOSArrow,
    XmlTree,
    XNode,
    (>>>),
    (/>),
    IOSArrow,
    XmlTree,
    XNode,
    (>>>),
    (/>),
+  (//>),
+  getChildren,
    getText,
    hasAttrValue,
    hasName,
    getText,
    hasAttrValue,
    hasName,
@@ -19,7 +22,6 @@ import Text.XML.HXT.Core (
    xshow,
    when
    )
    xshow,
    when
    )
-import Text.XML.HXT.Arrow.Edit (indentDoc)
  import Text.HandsomeSoup (css, parseHtml)
  
  import Epublishable
  import Text.HandsomeSoup (css, parseHtml)
  
  import Epublishable
@@ -40,17 +42,17 @@ instance XHTML FullPage where
      "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
      "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
      "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
      "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
-    "\n<head>\n" ++
+    "<head>" ++
      "  <meta http-equiv=\"Content-Type\"" ++
      " content=\"application/xhtml+xml; charset=utf-8\" />" ++
      "  <meta http-equiv=\"Content-Type\"" ++
      " content=\"application/xhtml+xml; charset=utf-8\" />" ++
-    "  <title>" ++ (headline fp) ++ "</title>\n" ++
-    "</head>\n" ++
-    "<body>\n" ++
-    "<div>\n\n" ++
+    "  <title>" ++ (headline fp) ++ "</title>" ++
+    "</head>" ++
+    "<body>" ++
+    "<div>" ++
      "<h1>" ++ (headline fp) ++ "</h1>" ++
      (articles_xhtml fp) ++
      "<h1>" ++ (headline fp) ++ "</h1>" ++
      (articles_xhtml fp) ++
-    "\n\n</div>\n" ++
-    "\n</body>\n" ++
+    "</div>" ++
+    "</body>" ++
      "</html>"
  
  instance Epublishable FullPage where
      "</html>"
  
  instance Epublishable FullPage where
@@ -61,7 +63,7 @@ instance Epublishable FullPage where
        them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
        _          -> return Nothing
  
        them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
        _          -> return Nothing
  
-  title _ = "LWN.net"
+  title = headline
  
  
  parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  
  
  parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
@@ -87,8 +89,8 @@ parse_article_byline xml = do
  
  parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  parse_article_title xml = do
  
  parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  parse_article_title xml = do
-  let element_filter = xml >>> css "h2.SummaryHL a"
-  let element_text_filter = element_filter /> getText
+  let element_filter = xml >>> css "h2.SummaryHL"
+  let element_text_filter = element_filter //> getText
    element_text <- runX element_text_filter
    return $ case element_text of
              [x] -> Just $ strip x
    element_text <- runX element_text_filter
    return $ case element_text of
              [x] -> Just $ strip x
@@ -96,18 +98,50 @@ parse_article_title xml = do
              _   -> error "Found more than one article title."
  
  
              _   -> error "Found more than one article title."
  
  
---is_title :: Integer
---is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL"))
  
  
---is_byline :: Integer
---is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine"))
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+  (hasName "h2")
+  >>>
+  (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+  (hasName "div")
+  >>>
+  (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+  hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+  processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+  processTopDown ((none) `when` is_byline)
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+  processTopDown ((none) `when` is_image)
+
+
  
  parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  parse_article_body xml = do
    -- First, delete the article title and byline.
  
  parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  parse_article_body xml = do
    -- First, delete the article title and byline.
-  --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline)
-  let clean_xml = xml
-  clean_html <- runX $ xshow clean_xml
+  let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
+  -- The only child of the body element should be a div.lwn-article
+  -- since we wrapped the article's HTML in that.
+  let clean_xml = clean_xml' >>> css "body" >>> getChildren
+  clean_html <- runX . xshow $ clean_xml
    return $ case clean_html of
              [x] -> Just x
              []  -> Nothing
    return $ case clean_html of
              [x] -> Just x
              []  -> Nothing
@@ -118,17 +152,14 @@ parse_article xml = do
    parsed_article_title    <- parse_article_title xml
    parsed_article_byline   <- parse_article_byline xml
    parsed_article_body     <- parse_article_body xml
    parsed_article_title    <- parse_article_title xml
    parsed_article_byline   <- parse_article_byline xml
    parsed_article_body     <- parse_article_body xml
---  let title'   = fromJust parsed_article_title
-  let title' = "title"
---  let byline' = fromJust parsed_article_byline
-  let byline' = "byline"
---  let body'   = fromJust parsed_article_body
-  body' <- runX . xshow $ xml
-  return $ Just $ Article title' byline' (body' !! 0)
+  let title'   = Title    $ fromJust parsed_article_title
+  let byline'  = Byline     parsed_article_byline
+  let body'    = BodyHtml $ fromJust parsed_article_body
+  return $ Just $ Article title' byline' body'
  
  parse_html_article :: String -> IO (Maybe Article)
  parse_html_article html = do
  
  parse_html_article :: String -> IO (Maybe Article)
  parse_html_article html = do
-  let xml = parseHtml html
+  let xml = parseHtml $ wrap_in_body_div html
    parse_article xml
  
    
    parse_article xml
  
    
@@ -139,20 +170,41 @@ parse_bodies xml =
    xml >>> css "div.ArticleText"
  
  
    xml >>> css "div.ArticleText"
  
  
+-- Debug, print a string.
+print_article :: String -> IO ()
+print_article s = do
+  putStrLn "-----------"
+  putStrLn "- Article -"
+  putStrLn "-----------"
+  putStrLn ""
+  putStrLn s
+  putStrLn ""
+
+
+-- Debug, print an article's body html.
+print_body :: Article -> IO ()
+print_body x =
+  print_article bh
+  where
+    bh' = body_html x
+    bh  = getBodyHtml bh'
+
  
  parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
  parse_articles xml = do
  
  parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
  parse_articles xml = do
-  bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc
+  bodies <- runX . xshow $ parse_bodies xml
    let article_separator = "<h2 class=\"SummaryHL\">"
    let article_separator = "<h2 class=\"SummaryHL\">"
-  let split_articles'' = split article_separator (bodies !! 0)
+  let split_articles'' = split article_separator (concat bodies)
    -- The first element will contain the crap before the first <h2...>.
    let split_articles' = tail split_articles''
    -- Put the separator back, it was lost during the split.
    -- The first element will contain the crap before the first <h2...>.
    let split_articles' = tail split_articles''
    -- Put the separator back, it was lost during the split.
-  let split_articles = map (("\n" ++ article_separator) ++) split_articles'
-  putStrLn "split articles\n\n"
-  mapM putStrLn split_articles
+  let split_articles = map (article_separator ++) split_articles'
+  --_ <- mapM print_article split_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
  
  
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
  
  
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
diff --git a/src/Main.hs b/src/Main.hs

index 37f89674eacbc115050f65f717a6bbe8530ddf71..c6ab443a9620d20f716576eb0c188702f8ad4b5d 100644 (file)
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -3,23 +3,34 @@ module Main
  where
  
  import Data.Maybe (fromJust)
  where
  
  import Data.Maybe (fromJust)
-import Text.HandsomeSoup (parseHtml)
+import Text.XML.HXT.Core -- (SysConfigList, IOStateArrow, XmlTree, readDocument)
  
  import Epublishable
  import LWN.ArticlePage
  import LWN.FullPage
  
  import Epublishable
  import LWN.ArticlePage
  import LWN.FullPage
-import XHTML
+
+my_read :: String -> IOStateArrow s b XmlTree
+my_read =
+  readDocument [ withValidate  no,
+                 withParseHTML yes,
+                 withInputEncoding utf8,
+                 withWarnings no ]
  
  main :: IO ()
  main = do
  
  main :: IO ()
  main = do
-  article_html <- readFile "test/fixtures/501317-article.html"
-  ioap <- parse $ parseHtml article_html
+  let article_html = my_read "test/fixtures/501317-article.html"
+  ioap <- parse article_html
    let article_page :: ArticlePage = fromJust $ ioap
    epublish article_page "single_article.epub"
  
    let article_page :: ArticlePage = fromJust $ ioap
    epublish article_page "single_article.epub"
  
-  page_html <- readFile "test/fixtures/500848-page.html"
-  ioap_f <- parse $ parseHtml page_html
+  let page_html = my_read "test/fixtures/500848-page.html"
+  ioap_f <- parse page_html
    let full_page :: FullPage = fromJust $ ioap_f
    let full_page :: FullPage = fromJust $ ioap_f
-  --putStrLn $ to_xhtml full_page
-  --epublish full_page "full_page.epub"
+  epublish full_page "full_page.epub"
+
+  let bigpage_html = my_read "test/fixtures/50844-bigpage.html"
+  ioap_bp <- parse bigpage_html
+  let bigpage :: FullPage = fromJust $ ioap_bp
+  epublish bigpage "bigpage.epub"
+
    putStrLn "Done."
    putStrLn "Done."
author	Michael Orlitzky <michael@orlitzky.com>
	Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)
committer	Michael Orlitzky <michael@orlitzky.com>
	Sun, 24 Jun 2012 03:47:24 +0000 (23:47 -0400)
src/LWN/Article.hs		patch \| blob \| history
src/LWN/ArticlePage.hs		patch \| blob \| history
src/LWN/FullPage.hs		patch \| blob \| history
src/Main.hs		patch \| blob \| history