1 {-# LANGUAGE DoAndIfThenElse #-}
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
9 import Data.String.Utils (split, strip)
10 import Data.Maybe (catMaybes, fromJust, isNothing)
11 import Prelude hiding (readFile)
12 import System.IO (Handle, hClose, hFlush)
13 import Test.HUnit (Assertion, assertEqual)
14 import Test.Framework (Test, testGroup)
15 import Test.Framework.Providers.HUnit (testCase)
22 import Text.XML.HXT.Core (
38 import Text.HandsomeSoup (css, parseHtml)
40 import Configuration (Cfg)
61 -- | An LWN page with one article on it.
62 ArticlePage { article :: Article } |
64 -- | An LWN page with more than one article on it. These require
65 -- different parsing and display functions than the single-article
67 FullPage { headline :: String,
68 articles :: [Article] }
71 instance XHTML Page where
72 to_xhtml (ArticlePage a) =
73 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
74 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
75 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
77 " <meta http-equiv=\"Content-Type\"" ++
78 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
79 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
86 to_xhtml (FullPage hl as) =
87 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
88 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
89 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
91 " <meta http-equiv=\"Content-Type\"" ++
92 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
93 " <title>" ++ hl ++ "</title>" ++
97 "<h1>" ++ hl ++ "</h1>" ++
98 (concatMap to_xhtml as) ++
-- | Retrieve the page at @url@ and attempt to parse it into a 'Page'.
--   Yields 'Nothing' when the downloaded contents cannot be turned
--   into an XML tree.
page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  contents <- get_article_contents cfg url
  maybe (return Nothing) parse (xml_from_contents contents)
-- | Collect every image src in the document and download each one,
--   producing a map from remote URL to the local copy.
--
--   Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml =
  runX (xml >>> image_srcs) >>= download_image_urls
-- | Rewrite every image @src@ attribute to point at the local copy
--   recorded in the given 'ImageMap'. Images without a local copy are
--   left alone (per the comment in 'change_src_func').
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (make_srcs_local `when` is_image)
    -- NOTE(review): the 'where' keyword, the case alternatives, and
    -- parts of these local definitions are elided from this view.
    -- old_src -> new_src
    change_src_func :: String -> String
    change_src_func old_src =
      case Map.lookup old_src image_map of
        -- Leave it alone if we don't have the file locally
    change_src :: (ArrowXml a) => a XmlTree XmlTree
      changeAttrValue change_src_func
    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
      processAttrl $ (change_src `when` (hasName "src"))
-- | Parse a preprocessed LWN document into a 'Page': download its
--   images, point the img srcs at the local copies, then run both the
--   single-article ('ap_parse') and full-page ('fp_parse') parsers.
parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  let clean_xml = xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map
  appr <- ap_parse local_xml
  fppr <- fp_parse local_xml
  -- NOTE(review): the branches of this conditional are elided here —
  -- presumably one result is preferred over the other; confirm.
  if (isNothing appr) then
-- | Pull the page headline out of the @div.PageHeadline h1@ element,
--   whitespace-stripped.
-- NOTE(review): more than one matching headline aborts via 'error'
-- instead of returning 'Nothing' — confirm that is intended.
parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  let element_filter = xml >>> css "div.PageHeadline h1"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
    -- (the case scrutinee and the empty-list alternative are elided here)
    [x] -> Just $ strip x
    _ -> error "Found more than one headline."
-- | Pull the article byline out of the @div.FeatureByLine@ element,
--   whitespace-stripped.
-- NOTE(review): more than one matching byline aborts via 'error'
-- instead of returning 'Nothing' — confirm that is intended.
parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml = do
  let element_filter = xml >>> css "div.FeatureByLine"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
    -- (the case scrutinee and the empty-list alternative are elided here)
    [x] -> Just $ strip x
    _ -> error "Found more than one article byline."
-- | Try to interpret the document as a single-article page. The only
--   visible alternative builds an 'ArticlePage' from exactly one
--   parsed article; the other case alternatives are elided here.
ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  arts <- ap_parse_articles xml
  [x] -> return $ Just $ ArticlePage x
-- | Extract the article body as raw HTML from @div.ArticleText@.
-- NOTE(review): multiple matches abort via 'error' rather than
-- returning 'Nothing' — confirm intended.
ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  let element_filter = xml >>> css "div.ArticleText"
  let element_html_filter = xshow element_filter
  element_html <- runX element_html_filter
  return $ case element_html of
    -- (the singleton and empty-list alternatives are elided here)
    _ -> error "Found more than one article."
-- | Parse headline, byline and body out of a single-article page and
--   assemble them into a one-element article list.
ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  parsed_headline <- parse_headline xml
  parsed_byline <- parse_byline xml
  parsed_body <- ap_parse_body xml
  -- NOTE(review): this runs *before* the isNothing guard below, so a
  -- page with no headline crashes in 'fromJust' right here; it also
  -- looks like leftover debug output. Consider removing or moving it
  -- after the guard.
  putStrLn $ fromJust parsed_headline
  if (isNothing parsed_headline) || (isNothing parsed_body)
  -- The 'fromJust's below are safe only because of the guard above
  -- (its branches are elided from this view).
  let title' = Title $ fromJust parsed_headline
  let byline' = Byline parsed_byline
  let body' = BodyHtml $ fromJust parsed_body
  return $ [Article title' byline' body']
-- | Try to interpret the document as a full (multi-article) page.
fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  case parsed_articles of
    -- NOTE(review): 'fromJust hl' crashes when the page has no
    -- headline; the other case alternative is elided here.
    x -> return $ Just $ FullPage (fromJust hl) x
-- | Title of one article on a full page, taken from the text under
--   its @h2.SummaryHL@ heading.
fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  let element_filter = xml >>> css "h2.SummaryHL"
  -- //> collects text from all descendants, not just direct children
  -- (titles may be wrapped in a link).
  let element_text_filter = element_filter //> getText
  element_text <- runX element_text_filter
  return $ case element_text of
    [x] -> Just $ strip x
    -- (the empty-list alternative is elided here)
    _ -> error "Found more than one article title."
-- | Body HTML of one article on a full page, with the title and
--   byline stripped out first.
fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  -- First, delete the article title and byline.
  let clean_xml' = xml >>> remove_title >>> remove_byline
  -- The only child of the body element should be a div.lwn-article
  -- since we wrapped the article's HTML in that.
  let clean_xml = clean_xml' >>> css "body" >>> getChildren
  clean_html <- runX . xshow $ clean_xml
  return $ case clean_html of
    -- (the singleton and empty-list alternatives are elided here)
    _ -> error "Found more than one article body."
-- | Assemble one 'Article' from a chunk of full-page HTML; 'Nothing'
--   is presumably produced when title or body is missing (the
--   then-branch of the guard is elided from this view).
fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  parsed_article_title <- fp_parse_article_title xml
  parsed_article_byline <- parse_byline xml
  parsed_article_body <- fp_parse_article_body xml
  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
  -- The 'fromJust's below are safe only because of the guard above.
  let title' = Title $ fromJust parsed_article_title
  let byline' = Byline parsed_article_byline
  let body' = BodyHtml $ fromJust parsed_article_body
  return $ Just $ Article title' byline' body'
-- | Parse one chunk of article HTML (as split out of a full page by
--   'fp_parse_articles') into an 'Article'. The rest of this body is
--   elided from this view.
parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
  let xml = parseHtml $ wrap_in_body_div html
-- | In the full page, all of the article titles and bodies are
-- wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
  -- (the equation head is elided from this view)
  xml >>> css "div.ArticleText"
-- | Split the full page's ArticleText contents into one HTML chunk
--   per article — chunks are delimited by their @h2.SummaryHL@
--   headings — and parse each chunk, keeping the successes.
fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  bodies <- runX . xshow $ parse_bodies xml
  let article_separator = "<h2 class=\"SummaryHL\">"
  let split_articles'' = split article_separator (concat bodies)
  -- The first element will contain the crap before the first <h2...>.
  let split_articles' = case split_articles'' of
    -- 'tail' is safe here: this alternative matches only non-empty
    -- lists (the other alternative is elided from this view).
    (_:_) -> tail split_articles''
  -- Put the separator back, it was lost during the split.
  let split_articles = map (article_separator ++) split_articles'
  --_ <- mapM print_article split_articles
  real_articles <- mapM parse_html_article split_articles
  -- Failed parses come back as Nothing and are dropped here.
  let just_articles = catMaybes real_articles
-- | This makes it easy to select otherwise-random chunks of html:
--   wrap them in a @div.lwn-article@ inside a @body@, so a CSS
--   selector can find them again.
wrap_in_body_div :: String -> String
  -- (the equation head binding @s@ is elided from this view)
  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
318 -- Epublishable stuff
-- | Displayable title of a 'Page': the article's own title for an
--   'ArticlePage', the headline for a 'FullPage'.
-- NOTE(review): the immediately preceding lines are elided — this may
-- be a method of a typeclass instance ("Epublishable stuff" above)
-- rather than a top-level binding; confirm context before reformatting.
title :: Page -> String
title (ArticlePage a) = getTitle $ LWN.Article.title a
title (FullPage hl _) = hl
-- | Dublin Core metadata block for the generated EPUB, stamped with
--   the current time as the publication date.
metadata :: Page -> IO String
  date <- getCurrentTime
  -- (the equation head and the start of this expression are elided)
  "<dc:creator>http://lwn.net/</dc:creator>\n" ++
  "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
  "<dc:language>en-US</dc:language>\n" ++
  "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
  "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
-- | Render the page as XHTML, convert it to an EPUB, and write the
--   result to the given handle (the write/flush lines are elided
--   from this view).
epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  let xhtml = to_xhtml obj
  -- NOTE(review): 'epmd' is bound on a line elided from this view,
  -- presumably the result of 'metadata' — confirm.
  epub <- xhtml_to_epub epmd xhtml
-- | Convert an XHTML string to EPUB bytes via Pandoc, embedding the
--   given metadata string in the EPUB writer options.
xhtml_to_epub :: String -> String -> IO B.ByteString
  -- (the equation head binding @epmd@ and 'where' are elided here)
  write_epub . read_html
    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    write_epub = writeEPUB Nothing [] my_writer_options
    read_html = readHtml defaultParserState
-- | 'preprocess' should replace anchor elements with spans.
test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  -- NOTE(review): indexing with !! 0 fails hard if preprocessing
  -- produces no body element.
  let actual_xml = actual_xml' !! 0
  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0
    -- (the assertEqual call and 'where' keyword are elided here)
    "Links are replaced with spans"
  input_html = "<body><a href=\"#\">Hello, world!</a></body>"
  input_xml = parseHtml input_html
  expected_html = "<body><span>Hello, world!</span></body>"
  expected_xml'' = parseHtml expected_html
-- | 'preprocess' should rewrite relative image srcs to absolute
--   https://lwn.net URLs.
test_absolve_images :: Assertion
test_absolve_images = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0
  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0
    -- (the assertEqual call and 'where' keyword are elided here)
    "Image srcs are made absolute"
    -- (the surrounding where-bindings are partially elided)
    "<img src=\"/images/2012/example.jpg\" />" ++
  input_xml = parseHtml input_html
    "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
  expected_xml'' = parseHtml expected_html
-- | 'preprocess' should strip links to the comments section.
test_comments_removed :: Assertion
test_comments_removed = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0
  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0
    -- (the assertEqual call and 'where' keyword are elided here)
    "Comment links are removed"
    -- (the input_html binding is partially elided)
    "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
  input_xml = parseHtml input_html
  expected_html = "<body><p></p></body>"
  expected_xml'' = parseHtml expected_html
-- | All Page tests, grouped for the test-framework runner. (The
--   binding's head is elided from this view.)
  testGroup "Page Tests" [
    testCase "Links are replaced with spans" test_preprocess_links,
    testCase "Image srcs are made absolute" test_absolve_images,
    testCase "Comment links are removed" test_comments_removed ]