1 {-# LANGUAGE DoAndIfThenElse #-}
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.List (isInfixOf)
11 import Data.String.Utils (split, strip)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Data.Tree.NTree.TypeDefs (NTree)
14 import Test.HUnit (Assertion, assertEqual)
15 import Test.Framework (Test, testGroup)
16 import Test.Framework.Providers.HUnit (testCase)
18 import Text.XML.HXT.Core (
42 import Text.HandsomeSoup (css, parseHtml)
45 import LWN.HTTP (save_image)
46 import LWN.URI (URL, try_make_absolute_url)
49 -- Map absolute image URLs to local system file paths where the image
50 -- referenced by the URL is stored.
51 type ImageMap = Map.Map URL FilePath
53 -- Should be called *after* preprocessing.
54 download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
55 download_images xml = do
56 image_urls <- runX $ xml >>> image_srcs
57 files <- mapM save_image image_urls
58 let pairs = zip image_urls files
59 return $ foldl my_insert empty_map pairs
61 empty_map = Map.empty :: ImageMap
63 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
64 my_insert dict (_, Nothing) = dict
65 my_insert dict (k, Just v) = Map.insert k v dict
69 -- | An LWN page with one article on it.
70 ArticlePage { article :: Article } |
72 -- | An LWN page with more than one article on it. These require
73 -- different parsing and display functions than the single-article
75 FullPage { headline :: String,
76 articles :: [Article] }
79 instance XHTML Page where
80 to_xhtml (ArticlePage a) =
81 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
82 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
83 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
85 " <meta http-equiv=\"Content-Type\"" ++
86 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
87 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
115 is_link :: (ArrowXml a) => a XmlTree XmlTree
117 isElem >>> hasName "a"
120 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
121 remove_comment_links =
122 processTopDown $ kill_comments `when` is_link
127 hasAttrValue "href" (contains "#Comments")
130 none `when` is_comment_link
132 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
133 replace_links_with_spans =
134 processTopDown $ (make_span >>> remove_attrs) `when` is_link
136 make_span = setElemName $ mkName "span"
137 remove_attrs = processAttrl none
140 -- | Preprocessing common to both page types.
141 preprocess :: (ArrowXml a) => a XmlTree XmlTree
143 make_image_srcs_absolute
147 replace_links_with_spans
150 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
151 replace_remote_img_srcs image_map =
152 processTopDown (make_srcs_local `when` is_image)
154 -- old_src -> new_src
155 change_src_func :: String -> String
156 change_src_func old_src =
157 case Map.lookup old_src image_map of
158 -- Leave it alone if we don't have the file locally
162 change_src :: (ArrowXml a) => a XmlTree XmlTree
164 changeAttrValue change_src_func
166 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
168 processAttrl $ (change_src `when` (hasName "src"))
171 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
173 let clean_xml = xml >>> preprocess
174 image_map <- download_images clean_xml
175 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
176 appr <- ap_parse local_xml
177 fppr <- fp_parse local_xml
179 if (isNothing appr) then
186 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
187 parse_headline xml = do
188 let element_filter = xml >>> css "div.PageHeadline h1"
189 let element_text_filter = element_filter /> getText
190 element_text <- runX element_text_filter
193 [x] -> Just $ strip x
195 _ -> error "Found more than one headline."
198 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
199 parse_byline xml = do
200 let element_filter = xml >>> css "div.FeatureByLine"
201 let element_text_filter = element_filter /> getText
202 element_text <- runX element_text_filter
205 [x] -> Just $ strip x
207 _ -> error "Found more than one article byline."
213 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
215 arts <- ap_parse_articles xml
217 [x] -> return $ Just $ ArticlePage x
221 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
222 ap_parse_body xml = do
223 let element_filter = xml >>> css "div.ArticleText"
224 let element_html_filter = xshow element_filter
225 element_html <- runX element_html_filter
226 return $ case element_html of
229 _ -> error "Found more than one article."
232 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
233 ap_parse_articles xml = do
234 parsed_headline <- parse_headline xml
235 parsed_byline <- parse_byline xml
236 parsed_body <- ap_parse_body xml
238 putStrLn $ fromJust parsed_headline
240 if (isNothing parsed_headline) || (isNothing parsed_body)
243 let title' = Title $ fromJust parsed_headline
244 let byline' = Byline parsed_byline
245 let body' = BodyHtml $ fromJust parsed_body
247 return $ [Article title' byline' body']
255 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
257 hl <- parse_headline xml
258 parsed_articles <- fp_parse_articles xml
259 case parsed_articles of
261 x -> return $ Just $ FullPage (fromJust hl) x
265 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
266 fp_parse_article_title xml = do
267 let element_filter = xml >>> css "h2.SummaryHL"
268 let element_text_filter = element_filter //> getText
269 element_text <- runX element_text_filter
270 return $ case element_text of
271 [x] -> Just $ strip x
273 _ -> error "Found more than one article title."
277 is_title :: (ArrowXml a) => a XmlTree XmlTree
281 (hasAttrValue "class" (== "SummaryHL"))
284 is_byline :: (ArrowXml a) => a XmlTree XmlTree
288 (hasAttrValue "class" (== "FeatureByLine"))
291 is_image :: (ArrowXml a) => a XmlTree XmlTree
292 is_image = isElem >>> hasName "img"
294 remove_title :: (ArrowXml a) => a XmlTree XmlTree
296 processTopDown ((none) `when` is_title)
299 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
301 processTopDown ((none) `when` is_byline)
305 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
306 fp_parse_article_body xml = do
307 -- First, delete the article title and byline.
308 let clean_xml' = xml >>> remove_title >>> remove_byline
309 -- The only child of the body element should be a div.lwn-article
310 -- since we wrapped the article's HTML in that.
311 let clean_xml = clean_xml' >>> css "body" >>> getChildren
312 clean_html <- runX . xshow $ clean_xml
313 return $ case clean_html of
316 _ -> error "Found more than one article body."
318 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
319 fp_parse_article xml = do
320 parsed_article_title <- fp_parse_article_title xml
321 parsed_article_byline <- parse_byline xml
322 parsed_article_body <- fp_parse_article_body xml
324 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
328 let title' = Title $ fromJust parsed_article_title
329 let byline' = Byline parsed_article_byline
330 let body' = BodyHtml $ fromJust parsed_article_body
331 return $ Just $ Article title' byline' body'
333 parse_html_article :: String -> IO (Maybe Article)
334 parse_html_article html = do
335 let xml = parseHtml $ wrap_in_body_div html
339 -- | In the full page, all of the article titles and bodies are
340 -- wrapped in one big div.ArticleText.
341 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
343 xml >>> css "div.ArticleText"
346 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
347 fp_parse_articles xml = do
348 bodies <- runX . xshow $ parse_bodies xml
349 let article_separator = "<h2 class=\"SummaryHL\">"
350 let split_articles'' = split article_separator (concat bodies)
351 -- The first element will contain the crap before the first <h2...>.
352 let split_articles' = case split_articles'' of
353 (_:_) -> tail split_articles''
355 -- Put the separator back, it was lost during the split.
356 let split_articles = map (article_separator ++) split_articles'
357 --_ <- mapM print_article split_articles
358 real_articles <- mapM parse_html_article split_articles
359 let just_articles = catMaybes real_articles
363 -- | This makes it easy to select otherwise-random chunks of html
365 wrap_in_body_div :: String -> String
367 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
373 -- Epublishable stuff
376 title :: Page -> String
377 title (ArticlePage a) = getTitle $ LWN.Article.title a
378 title (FullPage hl _) = hl
381 metadata :: Page -> IO String
383 date <- getCurrentTime
385 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
386 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
387 "<dc:language>en-US</dc:language>\n" ++
388 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
389 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
392 epublish :: Page -> Handle -> IO ()
393 epublish obj handle = do
394 let xhtml = to_xhtml obj
396 epub <- xhtml_to_epub epmd xhtml
400 xhtml_to_epub :: String -> String -> IO B.ByteString
402 write_epub . read_html
404 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
405 write_epub = writeEPUB Nothing [] my_writer_options
406 read_html = readHtml defaultParserState
414 image_srcs :: (ArrowXml a) => a XmlTree URL
420 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
421 make_image_srcs_absolute =
422 processTopDown (make_srcs_absolute `when` is_image)
424 change_src :: (ArrowXml a) => a XmlTree XmlTree
426 changeAttrValue try_make_absolute_url
428 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
430 processAttrl $ change_src `when` hasName "src"
437 test_preprocess_links :: Assertion
438 test_preprocess_links = do
439 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
440 let actual_xml = actual_xml' !! 0
442 expected_xml' <- runX $ expected_xml'' >>> css "body"
443 let expected_xml = expected_xml' !! 0
446 "Links are replaced with spans"
450 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
451 input_xml = parseHtml input_html
452 expected_html = "<body><span>Hello, world!</span></body>"
453 expected_xml'' = parseHtml expected_html
456 test_absolve_images :: Assertion
457 test_absolve_images = do
458 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
459 let actual_xml = actual_xml' !! 0
461 expected_xml' <- runX $ expected_xml'' >>> css "body"
462 let expected_xml = expected_xml' !! 0
465 "Image srcs are made absolute"
471 "<img src=\"/images/2012/example.jpg\" />" ++
473 input_xml = parseHtml input_html
476 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
478 expected_xml'' = parseHtml expected_html
481 test_comments_removed :: Assertion
482 test_comments_removed = do
483 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
484 let actual_xml = actual_xml' !! 0
486 expected_xml' <- runX $ expected_xml'' >>> css "body"
487 let expected_xml = expected_xml' !! 0
490 "Comment links are removed"
496 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
498 input_xml = parseHtml input_html
500 expected_html = "<body><p></p></body>"
501 expected_xml'' = parseHtml expected_html
507 testGroup "Page Tests" [
508 testCase "Links are replaced with spans" test_preprocess_links,
509 testCase "Image srcs are made absolute" test_absolve_images,
510 testCase "Comment links are removed" test_comments_removed ]