1 {-# LANGUAGE DoAndIfThenElse #-}
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
24 import Text.XML.HXT.Core (
43 import Text.HandsomeSoup (css, parseHtml)
45 import Configuration (Cfg, full_stories)
61 remove_full_story_paragraphs,
71 -- | An LWN page with one article on it.
72 ArticlePage { article :: Article } |
74 -- | An LWN page with more than one article on it. These require
75 -- different parsing and display functions than the single-article
77 FullPage { headline :: String,
78 articles :: [Article] }
81 instance XHTML Page where
82 to_xhtml (ArticlePage a) =
83 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
84 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
85 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
87 " <meta http-equiv=\"Content-Type\"" ++
88 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
89 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
115 page_from_url :: Cfg -> URL -> IO (Maybe Page)
116 page_from_url cfg url = do
117 contents <- get_article_contents cfg url
118 case (xml_from_contents contents) of
119 Just html -> parse cfg html
120 Nothing -> return Nothing
124 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
125 insert_full_stories story_map =
126 processTopDown (article_xml `when` full_story_paragraph)
128 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
130 case Map.lookup href story_map of
131 -- Leave it alone if we don't have the full story.
135 article_xml :: (ArrowXml a) => a XmlTree XmlTree
138 $< -- From HXT's Control.Arrow.ArrowList
139 (this /> full_story_link >>> getAttrValue "href")
141 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
142 replace_remote_img_srcs image_map =
143 processTopDown (make_srcs_local `when` is_image)
145 -- old_src -> new_src
146 change_src_func :: String -> String
147 change_src_func old_src =
148 case Map.lookup old_src image_map of
149 -- Leave it alone if we don't have the file locally
153 change_src :: (ArrowXml a) => a XmlTree XmlTree
155 changeAttrValue change_src_func
157 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
159 processAttrl $ (change_src `when` (hasName "src"))
164 -- Should be called *after* preprocessing.
165 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
166 download_images xml = do
167 image_urls <- runX $ xml >>> image_srcs
168 download_image_urls image_urls
172 type StoryMap = Map.Map URL Article
174 -- These come *before* preprocessing.
175 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
176 download_full_story_urls cfg story_urls = do
177 pages <- parallel $ map (page_from_url cfg) story_urls
178 let pairs = zip story_urls pages
179 return $ foldl my_insert empty_map pairs
181 empty_map = Map.empty :: StoryMap
183 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
184 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
185 my_insert dict (_, _) = dict
188 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
189 download_full_stories cfg xml = do
190 story_urls <- runX $ xml >>> full_story_urls
191 download_full_story_urls cfg story_urls
194 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
196 fs_xml <- if (full_stories cfg) then do
197 story_map <- download_full_stories cfg xml
198 return $ xml >>> insert_full_stories story_map
200 -- Get rid of them if we don't want them.
201 return $ xml >>> remove_full_story_paragraphs
203 let clean_xml = fs_xml >>> preprocess
204 image_map <- download_images clean_xml
205 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
207 appr <- ap_parse local_xml
208 fppr <- fp_parse local_xml
210 if (isNothing appr) then
217 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
218 parse_headline xml = do
219 let element_filter = xml >>> css "div.PageHeadline h1"
220 let element_text_filter = element_filter /> getText
221 element_text <- runX element_text_filter
224 [x] -> Just $ strip x
226 _ -> error "Found more than one headline."
229 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
230 parse_byline xml = do
231 let element_filter = xml >>> css "div.FeatureByLine"
232 let element_text_filter = element_filter /> getText
233 element_text <- runX element_text_filter
236 [x] -> Just $ strip x
238 _ -> error "Found more than one article byline."
244 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
246 arts <- ap_parse_articles xml
248 [x] -> return $ Just $ ArticlePage x
252 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
253 ap_parse_body xml = do
254 let element_filter = xml >>> css "div.ArticleText"
255 let element_html_filter = xshow element_filter
256 element_html <- runX element_html_filter
257 return $ case element_html of
260 _ -> error "Found more than one article."
263 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
264 ap_parse_articles xml = do
265 parsed_headline <- parse_headline xml
266 parsed_byline <- parse_byline xml
267 parsed_body <- ap_parse_body xml
269 if (isNothing parsed_headline) || (isNothing parsed_body)
272 let title' = Title $ fromJust parsed_headline
273 let byline' = Byline parsed_byline
274 let body' = BodyHtml $ fromJust parsed_body
276 return $ [Article title' byline' body']
284 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
286 hl <- parse_headline xml
287 parsed_articles <- fp_parse_articles xml
288 case parsed_articles of
290 x -> return $ Just $ FullPage (fromJust hl) x
294 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
295 fp_parse_article_title xml = do
296 let element_filter = xml >>> css "h2.SummaryHL"
297 let element_text_filter = element_filter //> getText
298 element_text <- runX element_text_filter
299 return $ case element_text of
300 [x] -> Just $ strip x
302 _ -> error "Found more than one article title."
307 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
308 fp_parse_article_body xml = do
309 -- First, delete the article title and byline.
310 let clean_xml' = xml >>> remove_title >>> remove_byline
311 -- The only child of the body element should be a div.lwn-article
312 -- since we wrapped the article's HTML in that.
313 let clean_xml = clean_xml' >>> css "body" >>> getChildren
314 clean_html <- runX . xshow $ clean_xml
315 return $ case clean_html of
318 _ -> error "Found more than one article body."
320 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
321 fp_parse_article xml = do
322 parsed_article_title <- fp_parse_article_title xml
323 parsed_article_byline <- parse_byline xml
324 parsed_article_body <- fp_parse_article_body xml
326 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
330 let title' = Title $ fromJust parsed_article_title
331 let byline' = Byline parsed_article_byline
332 let body' = BodyHtml $ fromJust parsed_article_body
333 return $ Just $ Article title' byline' body'
335 parse_html_article :: String -> IO (Maybe Article)
336 parse_html_article html = do
337 let xml = parseHtml $ wrap_in_body_div html
341 -- | In the full page, all of the article titles and bodies are
342 -- wrapped in one big div.ArticleText.
343 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
345 xml >>> css "div.ArticleText"
348 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
349 fp_parse_articles xml = do
350 bodies <- runX . xshow $ parse_bodies xml
351 let article_separator = "<h2 class=\"SummaryHL\">"
352 let split_articles'' = split article_separator (concat bodies)
353 -- The first element will contain the crap before the first <h2...>.
354 let split_articles' = case split_articles'' of
355 (_:_) -> tail split_articles''
357 -- Put the separator back, it was lost during the split.
358 let split_articles = map (article_separator ++) split_articles'
359 real_articles <- mapM parse_html_article split_articles
360 let just_articles = catMaybes real_articles
364 -- | This makes it easy to select otherwise-random chunks of html
366 wrap_in_body_div :: String -> String
368 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
374 -- Epublishable stuff
377 title :: Page -> String
378 title (ArticlePage a) = getTitle $ LWN.Article.title a
379 title (FullPage hl _) = hl
382 metadata :: Page -> IO String
384 date <- getCurrentTime
386 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
387 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
388 "<dc:language>en-US</dc:language>\n" ++
389 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
390 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
393 epublish :: Page -> Handle -> IO ()
394 epublish obj handle = do
395 let xhtml = to_xhtml obj
397 epub <- xhtml_to_epub epmd xhtml
402 xhtml_to_epub :: String -> String -> IO B.ByteString
404 write_epub . read_html
406 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
407 write_epub = writeEPUB Nothing [] my_writer_options
408 read_html = readHtml defaultParserState
415 test_preprocess_links :: Assertion
416 test_preprocess_links = do
417 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
418 let actual_xml = actual_xml' !! 0
420 expected_xml' <- runX $ expected_xml'' >>> css "body"
421 let expected_xml = expected_xml' !! 0
424 "Links are replaced with spans"
428 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
429 input_xml = parseHtml input_html
430 expected_html = "<body><span>Hello, world!</span></body>"
431 expected_xml'' = parseHtml expected_html
434 test_absolve_images :: Assertion
435 test_absolve_images = do
436 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
437 let actual_xml = actual_xml' !! 0
439 expected_xml' <- runX $ expected_xml'' >>> css "body"
440 let expected_xml = expected_xml' !! 0
443 "Image srcs are made absolute"
449 "<img src=\"/images/2012/example.jpg\" />" ++
451 input_xml = parseHtml input_html
454 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
456 expected_xml'' = parseHtml expected_html
459 test_comments_removed :: Assertion
460 test_comments_removed = do
461 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
462 let actual_xml = actual_xml' !! 0
464 expected_xml' <- runX $ expected_xml'' >>> css "body"
465 let expected_xml = expected_xml' !! 0
468 "Comment links are removed"
474 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
476 input_xml = parseHtml input_html
478 expected_html = "<body><p></p></body>"
479 expected_xml'' = parseHtml expected_html
482 test_full_story_urls_parsed :: Assertion
483 test_full_story_urls_parsed = do
484 actual <- runX $ actual'
487 "Full Story URLs are parsed"
491 expected = ["/Articles/500738/", "/Articles/501837/"]
495 "<a href=\"/Articles/500738/\">Full Story</a> ",
496 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
498 "<a href=\"/Articles/501837/\">Full Story</a> ",
499 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
502 xml = parseHtml full_story_html
503 actual' = xml >>> full_story_urls
507 testGroup "Page Tests" [
508 testCase "Links are replaced with spans" test_preprocess_links,
509 testCase "Image srcs are made absolute" test_absolve_images,
510 testCase "Comment links are removed" test_comments_removed,
511 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]