5 import Data.Time (getCurrentTime)
6 import System.IO (Handle)
7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
8 import Data.String.Utils (split, strip)
9 import Data.Maybe (catMaybes, fromJust, isNothing)
10 import Data.Tree.NTree.TypeDefs (NTree)
11 import Test.HUnit (Assertion, assertEqual)
12 import Test.Framework (Test, testGroup)
13 import Test.Framework.Providers.HUnit (testCase)
14 import Text.XML.HXT.Core (
35 import Text.HandsomeSoup (css, parseHtml)
41 -- | An LWN page with one article on it.
42 ArticlePage { article :: Article } |
44 -- | An LWN page with more than one article on it. These require
45 -- different parsing and display functions than the single-article
47 FullPage { headline :: String,
48 articles :: [Article] }
51 instance XHTML Page where
52 to_xhtml (ArticlePage a) =
53 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
54 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
55 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
57 " <meta http-equiv=\"Content-Type\"" ++
58 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
59 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
68 to_xhtml (FullPage hl as) =
69 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
70 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
71 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
73 " <meta http-equiv=\"Content-Type\"" ++
74 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
75 " <title>" ++ hl ++ "</title>" ++
79 "<h1>" ++ hl ++ "</h1>" ++
80 (concatMap to_xhtml as) ++
87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
89 processTopDown ((none) `when` is_image)
92 is_link :: (ArrowXml a) => a XmlTree XmlTree
96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
97 replace_links_with_spans =
98 processTopDown $ (make_span >>> remove_attrs) `when` is_link
100 make_span = setElemName $ mkName "span"
101 remove_attrs = processAttrl none
103 -- | Preprocessing common to both page types.
104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
106 xml >>> remove_images >>> replace_links_with_spans
109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
111 let clean_xml = preprocess xml
112 appr <- ap_parse clean_xml
113 fppr <- fp_parse clean_xml
115 if (isNothing appr) then
122 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
123 parse_headline xml = do
124 let element_filter = xml >>> css "div.PageHeadline h1"
125 let element_text_filter = element_filter /> getText
126 element_text <- runX element_text_filter
129 [x] -> Just $ strip x
131 _ -> error "Found more than one headline."
134 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
135 parse_byline xml = do
136 let element_filter = xml >>> css "div.FeatureByLine"
137 let element_text_filter = element_filter /> getText
138 element_text <- runX element_text_filter
141 [x] -> Just $ strip x
143 _ -> error "Found more than one article byline."
149 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
151 arts <- ap_parse_articles xml
153 Just [x] -> return $ Just $ ArticlePage x
157 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
158 ap_parse_body xml = do
159 let element_filter = xml >>> css "div.ArticleText"
160 let element_html_filter = xshow element_filter
161 element_html <- runX element_html_filter
162 return $ case element_html of
165 _ -> error "Found more than one article."
168 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
169 ap_parse_articles xml = do
170 parsed_headline <- parse_headline xml
171 parsed_byline <- parse_byline xml
172 parsed_body <- ap_parse_body xml
173 let title' = Title (fromJust parsed_headline)
174 let byline' = Byline parsed_byline
175 let body' = BodyHtml (fromJust parsed_body)
176 return $ Just $ [Article title' byline' body']
185 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
187 hl <- parse_headline xml
188 parsed_articles <- fp_parse_articles xml
189 case parsed_articles of
190 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
195 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
196 fp_parse_article_title xml = do
197 let element_filter = xml >>> css "h2.SummaryHL"
198 let element_text_filter = element_filter //> getText
199 element_text <- runX element_text_filter
200 return $ case element_text of
201 [x] -> Just $ strip x
203 _ -> error "Found more than one article title."
207 is_title :: (ArrowXml a) => a XmlTree XmlTree
211 (hasAttrValue "class" (== "SummaryHL"))
214 is_byline :: (ArrowXml a) => a XmlTree XmlTree
218 (hasAttrValue "class" (== "FeatureByLine"))
221 is_image :: (ArrowXml a) => a XmlTree XmlTree
225 remove_title :: (ArrowXml a) => a XmlTree XmlTree
227 processTopDown ((none) `when` is_title)
230 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
232 processTopDown ((none) `when` is_byline)
236 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
237 fp_parse_article_body xml = do
238 -- First, delete the article title and byline.
239 let clean_xml' = xml >>> remove_title >>> remove_byline
240 -- The only child of the body element should be a div.lwn-article
241 -- since we wrapped the article's HTML in that.
242 let clean_xml = clean_xml' >>> css "body" >>> getChildren
243 clean_html <- runX . xshow $ clean_xml
244 return $ case clean_html of
247 _ -> error "Found more than one article body."
249 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
250 fp_parse_article xml = do
251 parsed_article_title <- fp_parse_article_title xml
252 parsed_article_byline <- parse_byline xml
253 parsed_article_body <- fp_parse_article_body xml
254 let title' = Title $ fromJust parsed_article_title
255 let byline' = Byline parsed_article_byline
256 let body' = BodyHtml $ fromJust parsed_article_body
257 return $ Just $ Article title' byline' body'
259 parse_html_article :: String -> IO (Maybe Article)
260 parse_html_article html = do
261 let xml = parseHtml $ wrap_in_body_div html
265 -- | In the full page, all of the article titles and bodies are
266 -- wrapped in one big div.ArticleText.
267 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
269 xml >>> css "div.ArticleText"
272 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
273 fp_parse_articles xml = do
274 bodies <- runX . xshow $ parse_bodies xml
275 let article_separator = "<h2 class=\"SummaryHL\">"
276 let split_articles'' = split article_separator (concat bodies)
277 -- The first element will contain the crap before the first <h2...>.
278 let split_articles' = tail split_articles''
279 -- Put the separator back, it was lost during the split.
280 let split_articles = map (article_separator ++) split_articles'
281 --_ <- mapM print_article split_articles
282 real_articles <- mapM parse_html_article split_articles
283 let just_articles = catMaybes real_articles
287 -- | This makes it easy to select otherwise-random chunks of html
289 wrap_in_body_div :: String -> String
291 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
297 -- Epublishable stuff
300 title :: Page -> String
301 title (ArticlePage a) = getTitle $ LWN.Article.title a
302 title (FullPage hl _) = hl
305 metadata :: Page -> IO String
307 date <- getCurrentTime
309 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
310 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
311 "<dc:language>en-US</dc:language>\n" ++
312 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
313 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
316 epublish :: Page -> Handle -> IO ()
317 epublish obj handle = do
318 let xhtml = to_xhtml obj
320 epub <- xhtml_to_epub epmd xhtml
324 xhtml_to_epub :: String -> String -> IO B.ByteString
326 write_epub . read_html
328 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
329 write_epub = writeEPUB Nothing [] my_writer_options
330 read_html = readHtml defaultParserState
340 test_preprocess_links :: Assertion
341 test_preprocess_links = do
342 actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
343 let actual_xml = actual_xml' !! 0
345 expected_xml' <- runX $ expected_xml'' >>> css "body"
346 let expected_xml = expected_xml' !! 0
349 "Links are replaced with spans"
353 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
354 input_xml = parseHtml input_html
355 expected_html = "<body><span>Hello, world!</span></body>"
356 expected_xml'' = parseHtml expected_html
360 testGroup "Page Tests" [
361 testCase "Links are replaced with spans" test_preprocess_links ]