1 {-# LANGUAGE DoAndIfThenElse #-}
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
24 import Text.XML.HXT.Core (
43 import Text.HandsomeSoup (css, parseHtml)
45 import Configuration (Cfg)
70 -- | An LWN page with one article on it.
71 ArticlePage { article :: Article } |
73 -- | An LWN page with more than one article on it. These require
74 -- different parsing and display functions than the single-article
76 FullPage { headline :: String,
77 articles :: [Article] }
80 instance XHTML Page where
81 to_xhtml (ArticlePage a) =
82 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
83 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
84 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
86 " <meta http-equiv=\"Content-Type\"" ++
87 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
88 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
95 to_xhtml (FullPage hl as) =
96 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
97 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
98 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
100 " <meta http-equiv=\"Content-Type\"" ++
101 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
102 " <title>" ++ hl ++ "</title>" ++
106 "<h1>" ++ hl ++ "</h1>" ++
107 (concatMap to_xhtml as) ++
-- | Fetch the contents of the given URL and, if they convert to XML,
--   parse them into a 'Page'. Yields Nothing when no XML could be
--   obtained from the downloaded contents.
page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  contents <- get_article_contents cfg url
  maybe (return Nothing) (parse cfg) (xml_from_contents contents)
123 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
124 insert_full_stories story_map =
125 processTopDown (article_xml `when` full_story_paragraph)
127 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
129 case Map.lookup href story_map of
130 -- Leave it alone if we don't have the full story.
134 article_xml :: (ArrowXml a) => a XmlTree XmlTree
138 (this /> full_story_link >>> getAttrValue "href")
140 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
141 replace_remote_img_srcs image_map =
142 processTopDown (make_srcs_local `when` is_image)
144 -- old_src -> new_src
145 change_src_func :: String -> String
146 change_src_func old_src =
147 case Map.lookup old_src image_map of
148 -- Leave it alone if we don't have the file locally
152 change_src :: (ArrowXml a) => a XmlTree XmlTree
154 changeAttrValue change_src_func
156 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
158 processAttrl $ (change_src `when` (hasName "src"))
-- | Run the 'image_srcs' arrow over the document and hand every URL
--   it produces to 'download_image_urls', yielding the resulting
--   ImageMap.
--
--   Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml =
  runX (xml >>> image_srcs) >>= download_image_urls
171 type StoryMap = Map.Map URL Article
-- These come *before* preprocessing.

-- | Download each full-story URL (in parallel) and collect the
--   articles that come back into a 'StoryMap'. URLs whose pages do
--   not parse as single-article pages are simply left out.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
  pages <- parallel $ map (page_from_url cfg) story_urls
  -- NOTE(review): lazy foldl is fine for the handful of stories on
  -- one page, but foldl' would be safer for large inputs.
  return $ foldl remember (Map.empty :: StoryMap) (zip story_urls pages)
  where
    -- Record a story only when we got back a single-article page;
    -- anything else (FullPage, failed download) is ignored.
    remember :: StoryMap -> (URL, Maybe Page) -> StoryMap
    remember dict (k, Just (ArticlePage v)) = Map.insert k v dict
    remember dict (_, _) = dict
-- | Run the 'full_story_urls' arrow over the document and download
--   the story found at each URL it produces.
download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
download_full_stories cfg xml =
  runX (xml >>> full_story_urls) >>= download_full_story_urls cfg
193 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
195 story_map <- download_full_stories cfg xml
196 let fs_xml = xml >>> insert_full_stories story_map
198 let clean_xml = fs_xml >>> preprocess
199 image_map <- download_images clean_xml
200 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
202 appr <- ap_parse local_xml
203 fppr <- fp_parse local_xml
205 if (isNothing appr) then
212 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
213 parse_headline xml = do
214 let element_filter = xml >>> css "div.PageHeadline h1"
215 let element_text_filter = element_filter /> getText
216 element_text <- runX element_text_filter
219 [x] -> Just $ strip x
221 _ -> error "Found more than one headline."
224 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
225 parse_byline xml = do
226 let element_filter = xml >>> css "div.FeatureByLine"
227 let element_text_filter = element_filter /> getText
228 element_text <- runX element_text_filter
231 [x] -> Just $ strip x
233 _ -> error "Found more than one article byline."
239 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
241 arts <- ap_parse_articles xml
243 [x] -> return $ Just $ ArticlePage x
247 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
248 ap_parse_body xml = do
249 let element_filter = xml >>> css "div.ArticleText"
250 let element_html_filter = xshow element_filter
251 element_html <- runX element_html_filter
252 return $ case element_html of
255 _ -> error "Found more than one article."
258 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
259 ap_parse_articles xml = do
260 parsed_headline <- parse_headline xml
261 parsed_byline <- parse_byline xml
262 parsed_body <- ap_parse_body xml
264 putStrLn $ fromJust parsed_headline
266 if (isNothing parsed_headline) || (isNothing parsed_body)
269 let title' = Title $ fromJust parsed_headline
270 let byline' = Byline parsed_byline
271 let body' = BodyHtml $ fromJust parsed_body
273 return $ [Article title' byline' body']
281 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
283 hl <- parse_headline xml
284 parsed_articles <- fp_parse_articles xml
285 case parsed_articles of
287 x -> return $ Just $ FullPage (fromJust hl) x
291 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
292 fp_parse_article_title xml = do
293 let element_filter = xml >>> css "h2.SummaryHL"
294 let element_text_filter = element_filter //> getText
295 element_text <- runX element_text_filter
296 return $ case element_text of
297 [x] -> Just $ strip x
299 _ -> error "Found more than one article title."
304 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
305 fp_parse_article_body xml = do
306 -- First, delete the article title and byline.
307 let clean_xml' = xml >>> remove_title >>> remove_byline
308 -- The only child of the body element should be a div.lwn-article
309 -- since we wrapped the article's HTML in that.
310 let clean_xml = clean_xml' >>> css "body" >>> getChildren
311 clean_html <- runX . xshow $ clean_xml
312 return $ case clean_html of
315 _ -> error "Found more than one article body."
317 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
318 fp_parse_article xml = do
319 parsed_article_title <- fp_parse_article_title xml
320 parsed_article_byline <- parse_byline xml
321 parsed_article_body <- fp_parse_article_body xml
323 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
327 let title' = Title $ fromJust parsed_article_title
328 let byline' = Byline parsed_article_byline
329 let body' = BodyHtml $ fromJust parsed_article_body
330 return $ Just $ Article title' byline' body'
332 parse_html_article :: String -> IO (Maybe Article)
333 parse_html_article html = do
334 let xml = parseHtml $ wrap_in_body_div html
338 -- | In the full page, all of the article titles and bodies are
339 -- wrapped in one big div.ArticleText.
340 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
342 xml >>> css "div.ArticleText"
345 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
346 fp_parse_articles xml = do
347 bodies <- runX . xshow $ parse_bodies xml
348 let article_separator = "<h2 class=\"SummaryHL\">"
349 let split_articles'' = split article_separator (concat bodies)
350 -- The first element will contain the crap before the first <h2...>.
351 let split_articles' = case split_articles'' of
352 (_:_) -> tail split_articles''
354 -- Put the separator back, it was lost during the split.
355 let split_articles = map (article_separator ++) split_articles'
356 real_articles <- mapM parse_html_article split_articles
357 let just_articles = catMaybes real_articles
361 -- | This makes it easy to select otherwise-random chunks of html
363 wrap_in_body_div :: String -> String
365 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
371 -- Epublishable stuff
-- | The displayable title of a page: a single-article page takes the
--   title of its article, a full page uses its headline.
title :: Page -> String
title page =
  case page of
    ArticlePage a -> getTitle $ LWN.Article.title a
    FullPage hl _ -> hl
379 metadata :: Page -> IO String
381 date <- getCurrentTime
383 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
384 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
385 "<dc:language>en-US</dc:language>\n" ++
386 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
387 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
390 epublish :: Page -> Handle -> IO ()
391 epublish obj handle = do
392 let xhtml = to_xhtml obj
394 epub <- xhtml_to_epub epmd xhtml
399 xhtml_to_epub :: String -> String -> IO B.ByteString
401 write_epub . read_html
403 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
404 write_epub = writeEPUB Nothing [] my_writer_options
405 read_html = readHtml defaultParserState
412 test_preprocess_links :: Assertion
413 test_preprocess_links = do
414 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
415 let actual_xml = actual_xml' !! 0
417 expected_xml' <- runX $ expected_xml'' >>> css "body"
418 let expected_xml = expected_xml' !! 0
421 "Links are replaced with spans"
425 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
426 input_xml = parseHtml input_html
427 expected_html = "<body><span>Hello, world!</span></body>"
428 expected_xml'' = parseHtml expected_html
431 test_absolve_images :: Assertion
432 test_absolve_images = do
433 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
434 let actual_xml = actual_xml' !! 0
436 expected_xml' <- runX $ expected_xml'' >>> css "body"
437 let expected_xml = expected_xml' !! 0
440 "Image srcs are made absolute"
446 "<img src=\"/images/2012/example.jpg\" />" ++
448 input_xml = parseHtml input_html
451 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
453 expected_xml'' = parseHtml expected_html
456 test_comments_removed :: Assertion
457 test_comments_removed = do
458 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
459 let actual_xml = actual_xml' !! 0
461 expected_xml' <- runX $ expected_xml'' >>> css "body"
462 let expected_xml = expected_xml' !! 0
465 "Comment links are removed"
471 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
473 input_xml = parseHtml input_html
475 expected_html = "<body><p></p></body>"
476 expected_xml'' = parseHtml expected_html
479 test_full_story_urls_parsed :: Assertion
480 test_full_story_urls_parsed = do
481 actual <- runX $ actual'
484 "Full Story URLs are parsed"
488 expected = ["/Articles/500738/", "/Articles/501837/"]
492 "<a href=\"/Articles/500738/\">Full Story</a> ",
493 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
495 "<a href=\"/Articles/501837/\">Full Story</a> ",
496 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
499 xml = parseHtml full_story_html
500 actual' = xml >>> full_story_urls
504 testGroup "Page Tests" [
505 testCase "Links are replaced with spans" test_preprocess_links,
506 testCase "Image srcs are made absolute" test_absolve_images,
507 testCase "Comment links are removed" test_comments_removed,
508 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]