import XHTML
-data Article = Article { title :: String,
- byline :: String,
- body_html :: String }
+newtype Title = Title { getTitle :: String }
+newtype Byline = Byline { getByline :: Maybe String }
+newtype BodyHtml = BodyHtml { getBodyHtml :: String }
+
+instance Show Title where
+ show = getTitle
+
+instance Show Byline where
+ show (Byline (Just bl)) = bl
+ show (Byline Nothing ) = ""
+
+instance Show BodyHtml where
+ show = getBodyHtml
+
+instance XHTML Title where
+ to_xhtml (Title t) = "<h2>" ++ t ++ "</h2>"
+
+instance XHTML Byline where
+ to_xhtml (Byline (Just bl)) = "<p><em>" ++ bl ++ "</em></p>"
+ to_xhtml (Byline Nothing) = ""
+
+instance XHTML BodyHtml where
+ to_xhtml = getBodyHtml
+
+data Article = Article { title :: Title,
+ byline :: Byline,
+ body_html :: BodyHtml }
instance XHTML Article where
to_xhtml (Article t bl b) =
- "<h2>" ++ t ++ "</h2>\n\n" ++
- "<p><em>" ++ bl ++ "</em></p>\n\n" ++
- b
+ (to_xhtml t) ++
+ (to_xhtml bl) ++
+ (to_xhtml b)
"<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
"\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "\n<head>\n" ++
+ "<head>" ++
" <meta http-equiv=\"Content-Type\"" ++
" content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (LWN.Article.title a) ++ "</title>\n" ++
- "</head>\n" ++
- "<body>\n" ++
- "<div>\n\n" ++
+ " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
(to_xhtml a) ++
- "\n\n</div>\n" ++
- "\n</body>\n" ++
+ "</div>" ++
+ "</body>" ++
"</html>"
Just [x] -> return $ Just $ ArticlePage x
_ -> return Nothing
- title (ArticlePage x) = LWN.Article.title x
+ title (ArticlePage x) = show $ LWN.Article.title x
-- | Takes data from an LWN page and determines whether or not it's a
parsed_headline <- parse_headline xml
parsed_byline <- parse_byline xml
parsed_body <- parse_body xml
- let title' = fromJust parsed_headline
- let byline' = fromJust parsed_byline
- let body' = fromJust parsed_body
+ let title' = Title (fromJust parsed_headline)
+ let byline' = Byline parsed_byline
+ let body' = BodyHtml (fromJust parsed_body)
return $ Just $ [Article title' byline' body']
import Data.Maybe (catMaybes, fromJust)
import Data.Tree.NTree.TypeDefs (NTree)
import Text.XML.HXT.Core (
+ ArrowXml,
IOSArrow,
XmlTree,
XNode,
(>>>),
(/>),
+ (//>),
+ getChildren,
getText,
hasAttrValue,
hasName,
xshow,
when
)
-import Text.XML.HXT.Arrow.Edit (indentDoc)
import Text.HandsomeSoup (css, parseHtml)
import Epublishable
"<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
"\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "\n<head>\n" ++
+ "<head>" ++
" <meta http-equiv=\"Content-Type\"" ++
" content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (headline fp) ++ "</title>\n" ++
- "</head>\n" ++
- "<body>\n" ++
- "<div>\n\n" ++
+ " <title>" ++ (headline fp) ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
"<h1>" ++ (headline fp) ++ "</h1>" ++
(articles_xhtml fp) ++
- "\n\n</div>\n" ++
- "\n</body>\n" ++
+ "</div>" ++
+ "</body>" ++
"</html>"
instance Epublishable FullPage where
them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
_ -> return Nothing
- title _ = "LWN.net"
+ title = headline
parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_title xml = do
- let element_filter = xml >>> css "h2.SummaryHL a"
- let element_text_filter = element_filter /> getText
+ let element_filter = xml >>> css "h2.SummaryHL"
+ let element_text_filter = element_filter //> getText
element_text <- runX element_text_filter
return $ case element_text of
[x] -> Just $ strip x
_ -> error "Found more than one article title."
---is_title :: Integer
---is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL"))
---is_byline :: Integer
---is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine"))
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+ (hasName "h2")
+ >>>
+ (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+ (hasName "div")
+ >>>
+ (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+ hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+ processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+ processTopDown ((none) `when` is_byline)
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+ processTopDown ((none) `when` is_image)
+
+
parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_body xml = do
-- First, delete the article title and byline.
- --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline)
- let clean_xml = xml
- clean_html <- runX $ xshow clean_xml
+ let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
+ -- The only child of the body element should be a div.lwn-article
+ -- since we wrapped the article's HTML in that.
+ let clean_xml = clean_xml' >>> css "body" >>> getChildren
+ clean_html <- runX . xshow $ clean_xml
return $ case clean_html of
[x] -> Just x
[] -> Nothing
parsed_article_title <- parse_article_title xml
parsed_article_byline <- parse_article_byline xml
parsed_article_body <- parse_article_body xml
--- let title' = fromJust parsed_article_title
- let title' = "title"
--- let byline' = fromJust parsed_article_byline
- let byline' = "byline"
--- let body' = fromJust parsed_article_body
- body' <- runX . xshow $ xml
- return $ Just $ Article title' byline' (body' !! 0)
+ let title' = Title $ fromJust parsed_article_title
+ let byline' = Byline parsed_article_byline
+ let body' = BodyHtml $ fromJust parsed_article_body
+ return $ Just $ Article title' byline' body'
parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
- let xml = parseHtml html
+ let xml = parseHtml $ wrap_in_body_div html
parse_article xml
xml >>> css "div.ArticleText"
+-- Debug, print a string.
+print_article :: String -> IO ()
+print_article s = do
+ putStrLn "-----------"
+ putStrLn "- Article -"
+ putStrLn "-----------"
+ putStrLn ""
+ putStrLn s
+ putStrLn ""
+
+
+-- Debug, print an article's body html.
+print_body :: Article -> IO ()
+print_body x =
+ print_article bh
+ where
+ bh' = body_html x
+ bh = getBodyHtml bh'
+
parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
parse_articles xml = do
- bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc
+ bodies <- runX . xshow $ parse_bodies xml
let article_separator = "<h2 class=\"SummaryHL\">"
- let split_articles'' = split article_separator (bodies !! 0)
+ let split_articles'' = split article_separator (concat bodies)
-- The first element will contain the crap before the first <h2...>.
let split_articles' = tail split_articles''
-- Put the separator back, it was lost during the split.
- let split_articles = map (("\n" ++ article_separator) ++) split_articles'
- putStrLn "split articles\n\n"
- mapM putStrLn split_articles
+ let split_articles = map (article_separator ++) split_articles'
+ --_ <- mapM print_article split_articles
real_articles <- mapM parse_html_article split_articles
let just_articles = catMaybes real_articles
return just_articles
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+ "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
where
import Data.Maybe (fromJust)
-import Text.HandsomeSoup (parseHtml)
+import Text.XML.HXT.Core -- (SysConfigList, IOStateArrow, XmlTree, readDocument)
import Epublishable
import LWN.ArticlePage
import LWN.FullPage
-import XHTML
+
+my_read :: String -> IOStateArrow s b XmlTree
+my_read =
+ readDocument [ withValidate no,
+ withParseHTML yes,
+ withInputEncoding utf8,
+ withWarnings no ]
main :: IO ()
main = do
- article_html <- readFile "test/fixtures/501317-article.html"
- ioap <- parse $ parseHtml article_html
+ let article_html = my_read "test/fixtures/501317-article.html"
+ ioap <- parse article_html
let article_page :: ArticlePage = fromJust $ ioap
epublish article_page "single_article.epub"
- page_html <- readFile "test/fixtures/500848-page.html"
- ioap_f <- parse $ parseHtml page_html
+ let page_html = my_read "test/fixtures/500848-page.html"
+ ioap_f <- parse page_html
let full_page :: FullPage = fromJust $ ioap_f
- --putStrLn $ to_xhtml full_page
- --epublish full_page "full_page.epub"
+ epublish full_page "full_page.epub"
+
+ let bigpage_html = my_read "test/fixtures/50844-bigpage.html"
+ ioap_bp <- parse bigpage_html
+ let bigpage :: FullPage = fromJust $ ioap_bp
+ epublish bigpage "bigpage.epub"
+
putStrLn "Done."