1 {-# LANGUAGE DoAndIfThenElse #-}
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
25 import Text.Pandoc.Shared ( readDataFile )
26 import Text.XML.HXT.Core (
46 import Text.HandsomeSoup (css, parseHtml)
48 import Configuration (Cfg, full_stories)
64 remove_full_story_paragraphs,
74 -- | An LWN page with one article on it.
75 ArticlePage { article :: Article } |
77 -- | An LWN page with more than one article on it. These require
78 -- different parsing and display functions than the single-article
80 FullPage { headline :: String,
81 articles :: [Article] }
84 instance XHTML Page where
85 to_xhtml (ArticlePage a) =
86 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
87 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
88 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
90 " <meta http-equiv=\"Content-Type\"" ++
91 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
92 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
99 to_xhtml (FullPage hl as) =
100 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
101 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
102 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
104 " <meta http-equiv=\"Content-Type\"" ++
105 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
106 " <title>" ++ hl ++ "</title>" ++
110 "<h1>" ++ hl ++ "</h1>" ++
111 (concatMap to_xhtml as) ++
118 -- | Stolen from writeEPUB.
119 default_stylesheet :: IO String
121 -- This comes with Pandoc, I guess.
122 readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
125 construct_stylesheet :: IO String
126 construct_stylesheet = do
127 defaults <- default_stylesheet
128 -- Allow word-wrapping in <pre> elements.
129 let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
130 return $ defaults ++ my_additions
132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
133 page_from_url cfg url = do
134 contents <- get_article_contents cfg url
135 case (xml_from_contents contents) of
136 Just html -> parse cfg html
137 Nothing -> return Nothing
141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
142 insert_full_stories story_map =
143 processTopDown (article_xml `when` full_story_paragraph)
145 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
147 case Map.lookup href story_map of
148 -- Drop the paragraph if we don't have the contents.
152 article_xml :: (ArrowXml a) => a XmlTree XmlTree
155 $< -- From HXT's Control.Arrow.ArrowList
156 (this /> full_story_link >>> getAttrValue "href")
158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
159 replace_remote_img_srcs image_map =
160 processTopDown (make_srcs_local `when` is_image)
162 -- old_src -> new_src
163 change_src_func :: String -> String
164 change_src_func old_src =
165 case Map.lookup old_src image_map of
166 -- If we don't have the file, empty the src. Pandoc will crash
171 change_src :: (ArrowXml a) => a XmlTree XmlTree
173 changeAttrValue change_src_func
175 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
177 processAttrl $ (change_src `when` (hasName "src"))
182 -- Should be called *after* preprocessing.
183 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
184 download_images xml = do
185 image_urls <- runX $ xml >>> image_srcs
186 download_image_urls image_urls
190 type StoryMap = Map.Map URL Article
192 -- These come *before* preprocessing.
193 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
194 download_full_story_urls cfg story_urls = do
195 pages <- parallel $ map (page_from_url cfg) story_urls
196 let pairs = zip story_urls pages
197 return $ foldl my_insert empty_map pairs
199 empty_map = Map.empty :: StoryMap
201 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
202 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
203 my_insert dict (_, _) = dict
206 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
207 download_full_stories cfg xml = do
208 story_urls <- runX $ xml >>> full_story_urls
209 download_full_story_urls cfg story_urls
212 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
214 fs_xml <- if (full_stories cfg) then do
215 story_map <- download_full_stories cfg xml
216 return $ xml >>> insert_full_stories story_map
218 -- Get rid of them if we don't want them.
219 return $ xml >>> remove_full_story_paragraphs
221 let clean_xml = fs_xml >>> preprocess
222 image_map <- download_images clean_xml
223 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
225 appr <- ap_parse local_xml
226 fppr <- fp_parse local_xml
228 if (isNothing appr) then
235 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
236 parse_headline xml = do
237 let element_filter = xml >>> css "div.PageHeadline h1"
238 let element_text_filter = element_filter /> getText
239 element_text <- runX element_text_filter
242 [x] -> Just $ strip x
244 _ -> error "Found more than one headline."
247 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
248 parse_byline xml = do
249 let element_filter = xml >>> css "div.FeatureByLine"
250 let element_text_filter = element_filter /> getText
251 element_text <- runX element_text_filter
254 [x] -> Just $ strip x
256 _ -> error "Found more than one article byline."
262 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
264 arts <- ap_parse_articles xml
266 [x] -> return $ Just $ ArticlePage x
270 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
271 ap_parse_body xml = do
272 let element_filter = xml >>> css "div.ArticleText"
273 let element_html_filter = xshow element_filter
274 element_html <- runX element_html_filter
275 return $ case element_html of
278 _ -> error "Found more than one article."
281 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
282 ap_parse_articles xml = do
283 parsed_headline <- parse_headline xml
284 parsed_byline <- parse_byline xml
285 parsed_body <- ap_parse_body xml
287 if (isNothing parsed_headline) || (isNothing parsed_body)
290 let title' = Title $ fromJust parsed_headline
291 let byline' = Byline parsed_byline
292 let body' = BodyHtml $ fromJust parsed_body
294 return $ [Article title' byline' body']
302 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
304 hl <- parse_headline xml
305 parsed_articles <- fp_parse_articles xml
306 return $ case parsed_articles of
308 x -> Just $ FullPage (fromJust hl) x
312 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
313 fp_parse_article_title xml = do
314 let element_filter = xml >>> css "h2.SummaryHL"
315 let element_text_filter = element_filter //> getText
316 element_text <- runX element_text_filter
317 return $ case element_text of
318 [x] -> Just $ strip x
320 _ -> error "Found more than one article title."
325 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
326 fp_parse_article_body xml = do
327 -- First, delete the article title and byline.
328 let clean_xml' = xml >>> remove_title >>> remove_byline
329 -- The only child of the body element should be a div.lwn-article
330 -- since we wrapped the article's HTML in that.
331 let clean_xml = clean_xml' >>> css "body" >>> getChildren
332 clean_html <- runX . xshow $ clean_xml
333 return $ case clean_html of
336 _ -> error "Found more than one article body."
338 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
339 fp_parse_article xml = do
340 parsed_article_title <- fp_parse_article_title xml
341 parsed_article_byline <- parse_byline xml
342 parsed_article_body <- fp_parse_article_body xml
344 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
348 let title' = Title $ fromJust parsed_article_title
349 let byline' = Byline parsed_article_byline
350 let body' = BodyHtml $ fromJust parsed_article_body
351 return $ Just $ Article title' byline' body'
353 parse_html_article :: String -> IO (Maybe Article)
354 parse_html_article html = do
355 let xml = parseHtml $ wrap_in_body_div html
359 -- | In the full page, all of the article titles and bodies are
360 -- wrapped in one big div.ArticleText.
361 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
363 xml >>> css "div.ArticleText"
366 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
367 fp_parse_articles xml = do
368 bodies <- runX . xshow $ parse_bodies xml
369 let article_separator = "<h2 class=\"SummaryHL\">"
370 let split_articles'' = split article_separator (concat bodies)
371 -- The first element will contain the crap before the first <h2...>.
372 let split_articles' = case split_articles'' of
373 (_:_) -> tail split_articles''
375 -- Put the separator back, it was lost during the split.
376 let split_articles = map (article_separator ++) split_articles'
377 real_articles <- mapM parse_html_article split_articles
378 let just_articles = catMaybes real_articles
382 -- | This makes it easy to select otherwise-random chunks of html
384 wrap_in_body_div :: String -> String
386 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
392 -- Epublishable stuff
395 title :: Page -> String
396 title (ArticlePage a) = getTitle $ LWN.Article.title a
397 title (FullPage hl _) = hl
400 metadata :: Page -> IO String
402 date <- getCurrentTime
404 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
405 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
406 "<dc:language>en-US</dc:language>\n" ++
407 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
408 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
411 epublish :: Page -> Handle -> IO ()
412 epublish obj handle = do
413 let xhtml = to_xhtml obj
415 epub <- xhtml_to_epub epmd xhtml
420 xhtml_to_epub :: String -> String -> IO B.ByteString
421 xhtml_to_epub epmd xhtml = do
422 stylesheet <- construct_stylesheet
429 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
430 read_html = readHtml defaultParserState
437 test_preprocess_links :: Assertion
438 test_preprocess_links = do
439 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
440 let actual_xml = actual_xml' !! 0
442 expected_xml' <- runX $ expected_xml'' >>> css "body"
443 let expected_xml = expected_xml' !! 0
446 "Links are replaced with spans"
450 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
451 input_xml = parseHtml input_html
452 expected_html = "<body><span>Hello, world!</span></body>"
453 expected_xml'' = parseHtml expected_html
456 test_absolve_images :: Assertion
457 test_absolve_images = do
458 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
459 let actual_xml = actual_xml' !! 0
461 expected_xml' <- runX $ expected_xml'' >>> css "body"
462 let expected_xml = expected_xml' !! 0
465 "Image srcs are made absolute"
471 "<img src=\"/images/2012/example.jpg\" />" ++
473 input_xml = parseHtml input_html
476 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
478 expected_xml'' = parseHtml expected_html
481 test_comments_removed :: Assertion
482 test_comments_removed = do
483 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
484 let actual_xml = actual_xml' !! 0
486 expected_xml' <- runX $ expected_xml'' >>> css "body"
487 let expected_xml = expected_xml' !! 0
490 "Comment links are removed"
496 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
498 input_xml = parseHtml input_html
500 expected_html = "<body><p></p></body>"
501 expected_xml'' = parseHtml expected_html
504 test_full_story_urls_parsed :: Assertion
505 test_full_story_urls_parsed = do
506 actual <- runX $ actual'
509 "Full Story URLs are parsed"
513 expected = ["/Articles/500738/", "/Articles/501837/"]
517 "<a href=\"/Articles/500738/\">Full Story</a> ",
518 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
520 "<a href=\"/Articles/501837/\">Full Story</a> ",
521 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
524 xml = parseHtml full_story_html
525 actual' = xml >>> full_story_urls
529 testGroup "Page Tests" [
530 testCase "Links are replaced with spans" test_preprocess_links,
531 testCase "Image srcs are made absolute" test_absolve_images,
532 testCase "Comment links are removed" test_comments_removed,
533 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]