]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Add a --full-stories option which may or may not work.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata)
24 import Text.XML.HXT.Core (
25 ArrowXml,
26 IOSArrow,
27 XmlTree,
28 ($<),
29 (>>>),
30 (/>),
31 (//>),
32 changeAttrValue,
33 getAttrValue,
34 getChildren,
35 getText,
36 hasName,
37 processAttrl,
38 processTopDown,
39 this,
40 runX,
41 xshow,
42 when)
43 import Text.HandsomeSoup (css, parseHtml)
44
45 import Configuration (Cfg, full_stories)
46 import LWN.Article
47 import LWN.HTTP (
48 ImageMap,
49 download_image_urls,
50 get_article_contents)
51 import LWN.URI (URL)
52 import LWN.XHTML (
53 XHTML,
54 full_story_urls,
55 image_srcs,
56 full_story_link,
57 full_story_paragraph,
58 is_image,
59 preprocess,
60 remove_byline,
61 remove_title,
62 to_xhtml,
63 to_xml,
64 xml_from_contents)
65
66
67
68
-- | A parsed LWN page, in one of the two layouts this module
--   understands.
data Page
  = -- | An LWN page with one article on it.
    ArticlePage
      { article :: Article
      }
  | -- | An LWN page with more than one article on it. These require
    --   different parsing and display functions than the
    --   single-article pages.
    FullPage
      { headline :: String
      , articles :: [Article]
      }
78
79
-- | Render a 'Page' as a complete XHTML document.
--
--   An 'ArticlePage' becomes a document whose body is the rendered
--   article; a 'FullPage' becomes a document with the headline in an
--   @h1@ followed by every article, all wrapped in a single @div@.
instance XHTML Page where
  to_xhtml (ArticlePage a) =
    -- Unwrap the title with getTitle rather than 'show' so that the
    -- <title> element contains the bare title text.
    xhtml_page_document (getTitle $ LWN.Article.title a) (to_xhtml a)

  to_xhtml (FullPage hl as) =
    xhtml_page_document hl $
      "<div>" ++
      "<h1>" ++ hl ++ "</h1>" ++
      (concatMap to_xhtml as) ++
      "</div>"


-- | Wrap a document title and some body markup in the XHTML 1.1
--   boilerplate shared by both kinds of 'Page'.
--
--   The first argument is placed in the @title@ element, the second
--   is placed verbatim inside the @body@ element.
xhtml_page_document :: String -> String -> String
xhtml_page_document page_title body_html =
  "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
  -- The public and system identifiers must be separated by
  -- whitespace, hence the trailing space on this line.
  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" " ++
  "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
  -- Open the root element that the closing </html> below matches.
  "<html xmlns=\"http://www.w3.org/1999/xhtml\">" ++
  "<head>" ++
  "  <meta http-equiv=\"Content-Type\"" ++
  " content=\"application/xhtml+xml; charset=utf-8\" />" ++
  "  <title>" ++ page_title ++ "</title>" ++
  "</head>" ++
  "<body>" ++
  body_html ++
  "</body>" ++
  "</html>"
111
112
113
-- | Fetch the given URL and try to parse whatever comes back into a
--   'Page'. Produces Nothing when the fetched contents cannot be
--   turned into XML, or when 'parse' itself gives up.
page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url =
  get_article_contents cfg url
    >>= maybe (return Nothing) (parse cfg) . xml_from_contents
120
121
122
-- | Walk the document top-down and, for every node matching
--   'full_story_paragraph', substitute the XML of the article that
--   its full-story link points to (when that article is present in
--   the given 'StoryMap').
insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
insert_full_stories story_map =
  processTopDown (splice_story `when` full_story_paragraph)
  where
    -- The href of the full-story link found among the node's children.
    story_href :: (ArrowXml a) => a XmlTree String
    story_href = getChildren >>> full_story_link >>> getAttrValue "href"

    -- The replacement for a paragraph whose link has the given href.
    -- Paragraphs whose story we don't have are passed through as-is.
    story_for :: (ArrowXml a) => URL -> a XmlTree XmlTree
    story_for href = maybe this to_xml (Map.lookup href story_map)

    splice_story :: (ArrowXml a) => a XmlTree XmlTree
    splice_story = story_for $< story_href
139
-- | Walk the document top-down and rewrite the @src@ attribute of
--   every node matching 'is_image', using the given 'ImageMap' to
--   translate the old value into the new one.
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (localize_srcs `when` is_image)
  where
    -- Translate one src value; values missing from the map are
    -- returned unchanged.
    local_src :: String -> String
    local_src remote = maybe remote id (Map.lookup remote image_map)

    -- Apply the translation to the "src" attribute only.
    localize_srcs :: (ArrowXml a) => a XmlTree XmlTree
    localize_srcs =
      processAttrl (changeAttrValue local_src `when` hasName "src")
159
160
161
162
-- | Collect the image sources from the document and hand them to
--   'download_image_urls'. This is meant to be run *after* the
--   document has been preprocessed.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml =
  runX (xml >>> image_srcs) >>= download_image_urls
168
169
170
-- | Associates the URL of a full story with the article parsed from it.
type StoryMap = Map.Map URL Article

-- | Fetch and parse every given URL (via 'parallel'), keeping only
--   those that parsed as a single-article page. These URLs are
--   collected *before* preprocessing.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
  fetched <- parallel [ page_from_url cfg u | u <- story_urls ]
  return $ foldl remember (Map.empty :: StoryMap) (zip story_urls fetched)
  where
    -- Record the article for a URL; anything that isn't an
    -- ArticlePage (including failures) leaves the map untouched.
    remember :: StoryMap -> (URL, Maybe Page) -> StoryMap
    remember acc (u, Just (ArticlePage art)) = Map.insert u art acc
    remember acc (_, _) = acc
185
186
-- | Extract the full-story URLs from the document and download the
--   articles they refer to.
download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
download_full_stories cfg xml =
  runX (xml >>> full_story_urls) >>= download_full_story_urls cfg
191
192
-- | Turn a document into a 'Page'.
--
--   The steps are: optionally splice in the full stories, preprocess
--   the result, download its images and point the image sources at
--   the local copies, and finally try to interpret the document as a
--   single-article page, falling back to a multi-article page.
--
--   Returns Nothing when neither interpretation succeeds.
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
  -- Only hit the network for the full stories when the user asked
  -- for them; the story map is not used for anything else.
  fs_xml <- if (full_stories cfg)
            then do
              story_map <- download_full_stories cfg xml
              return $ xml >>> insert_full_stories story_map
            else
              return xml

  let clean_xml = fs_xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map

  -- Prefer the single-article interpretation. The full-page parser
  -- is consulted only if that fails, so that it never gets a chance
  -- to reject a document we already understand.
  appr <- ap_parse local_xml
  case appr of
    Just _  -> return appr
    Nothing -> fp_parse local_xml
212
213
214
-- | Extract the text directly beneath the @div.PageHeadline h1@
--   element, with surrounding whitespace stripped. Gives Nothing if
--   no text is found, and calls 'error' if more than one piece of
--   text is found.
parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  texts <- runX ((xml >>> css "div.PageHeadline h1") /> getText)
  return $ only_headline texts
  where
    only_headline :: [String] -> Maybe String
    only_headline []  = Nothing
    only_headline [t] = Just (strip t)
    only_headline _   = error "Found more than one headline."
225
226
-- | Extract the text directly beneath the @div.FeatureByLine@
--   element, with surrounding whitespace stripped. Gives Nothing if
--   no text is found, and calls 'error' if more than one piece of
--   text is found.
parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml =
  fmap only_byline $ runX $ (xml >>> css "div.FeatureByLine") /> getText
  where
    only_byline :: [String] -> Maybe String
    only_byline candidates =
      case candidates of
        []  -> Nothing
        [b] -> Just $ strip b
        _   -> error "Found more than one article byline."
237
238
239 --
240 -- ArticlePage Stuff
241 --
-- | Try to read the document as a single-article page. Succeeds only
--   when exactly one article is found.
ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
ap_parse xml =
  ap_parse_articles xml >>= return . only_article
  where
    only_article :: [Article] -> Maybe Page
    only_article [a] = Just (ArticlePage a)
    only_article _   = Nothing
248
249
-- | Serialize the @div.ArticleText@ element of the document to a
--   string of markup.
--
--   NOTE(review): 'xshow' looks like it gathers every match into a
--   single string, in which case only the one-result branch below
--   can ever be taken -- confirm against the HXT documentation.
ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  rendered <- runX . xshow $ xml >>> css "div.ArticleText"
  return (only_body rendered)
  where
    only_body :: [String] -> Maybe String
    only_body []     = Nothing
    only_body [body] = Just body
    only_body _      = error "Found more than one article."
259
260
-- | Build the (at most one) article of a single-article page from
--   its headline, byline and body. The headline and the body are
--   both required; the byline is optional.
ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  mb_headline <- parse_headline xml
  mb_byline   <- parse_byline xml
  mb_body     <- ap_parse_body xml

  case (mb_headline, mb_body) of
    (Just hl, Just body) ->
      return [Article (Title hl) (Byline mb_byline) (BodyHtml body)]
    _ ->
      return []
275
276
277
278 --
279 -- FullPage Stuff
280 --
281
-- | Try to read the document as a multi-article page.
--
--   Succeeds only when at least one article is found *and* the page
--   has a headline; a 'FullPage' cannot be built without one, so a
--   missing headline is reported as Nothing rather than left behind
--   as a value that blows up when the headline is first used.
fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
fp_parse xml = do
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  return $
    case (parsed_articles, hl) of
      ([], _)          -> Nothing
      (_, Nothing)     -> Nothing
      (arts, Just hl') -> Just $ FullPage hl' arts
289
290
291
-- | Extract the text found anywhere beneath the @h2.SummaryHL@
--   element, with surrounding whitespace stripped. Gives Nothing if
--   no text is found, and calls 'error' if more than one piece of
--   text is found.
fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  texts <- runX ((xml >>> css "h2.SummaryHL") //> getText)
  return $ only_title texts
  where
    only_title :: [String] -> Maybe String
    only_title []  = Nothing
    only_title [t] = Just (strip t)
    only_title _   = error "Found more than one article title."
301
302
303
304
-- | Serialize the body of one article from a multi-article page,
--   minus its title and byline.
fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  rendered <- runX (xshow body_children)
  return $ only_body rendered
  where
    -- Get rid of the article title and byline before anything else.
    without_heading = xml >>> remove_title >>> remove_byline

    -- We wrapped the article's HTML in a div.lwn-article, so that
    -- div should be the only child of the body element.
    body_children = without_heading >>> css "body" >>> getChildren

    only_body :: [String] -> Maybe String
    only_body []     = Nothing
    only_body [body] = Just body
    only_body _      = error "Found more than one article body."
317
-- | Build one article of a multi-article page from its title, byline
--   and body. The title and the body are both required; the byline
--   is optional.
fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  mb_title  <- fp_parse_article_title xml
  mb_byline <- parse_byline xml
  mb_body   <- fp_parse_article_body xml

  case (mb_title, mb_body) of
    (Just t, Just body) ->
      return $ Just $ Article (Title t) (Byline mb_byline) (BodyHtml body)
    _ ->
      return Nothing
332
-- | Parse a fragment of article HTML (as a string) into an article,
--   after wrapping it with 'wrap_in_body_div'.
parse_html_article :: String -> IO (Maybe Article)
parse_html_article =
  fp_parse_article . parseHtml . wrap_in_body_div
337
338
-- | Select the div.ArticleText element(s) of the document. In the
--   full page, that one big div wraps all of the article titles and
--   bodies.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
parse_bodies =
  (>>> css "div.ArticleText")
344
345
-- | Parse every article on a multi-article page.
--
--   The serialized article text is cut into pieces at each article
--   heading, and each piece is parsed on its own. Pieces that fail
--   to parse are dropped.
fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  rendered <- runX (xshow (parse_bodies xml))
  let separator = "<h2 class=\"SummaryHL\">"
      -- Whatever precedes the first heading is not an article, so
      -- the first piece is thrown away.
      pieces = drop 1 (split separator (concat rendered))
  -- Splitting ate the separator; glue it back on before parsing.
  parsed <- mapM (parse_html_article . (separator ++)) pieces
  return (catMaybes parsed)
360
361
-- | Surround a chunk of html with a body element and a
--   div.lwn-article, so that otherwise-random chunks can be selected
--   easily using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
  concat ["<body><div class=\"lwn-article\">", s, "</div></body>"]
367
368
369
370
371 --
372 -- Epublishable stuff
373 --
374
-- | The title of a page: the article's own title for a
--   single-article page, the headline for a multi-article page.
title :: Page -> String
title page =
  case page of
    ArticlePage a -> getTitle (LWN.Article.title a)
    FullPage hl _ -> hl
378
379
-- | Produce the Dublin Core metadata elements (creator, date,
--   language, rights and title) for a page, one per line. The date
--   is the time at which this function is run.
--
--   NOTE(review): the date is rendered with 'show', which is
--   probably not the format that EPUB readers expect for dc:date --
--   confirm and consider formatting it explicitly.
metadata :: Page -> IO String
metadata obj = do
  date <- getCurrentTime
  return $
    "<dc:creator>http://lwn.net/</dc:creator>\n" ++
    "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
    "<dc:language>en-US</dc:language>\n" ++
    "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
    "<dc:title>" ++ (escape_xml $ LWN.Page.title obj) ++ "</dc:title>\n"
  where
    -- The title is free-form text taken from the page, so the
    -- characters that are special in XML character data have to be
    -- escaped before it is placed inside an element.
    escape_xml :: String -> String
    escape_xml = concatMap escape_char

    escape_char :: Char -> String
    escape_char '&' = "&amp;"
    escape_char '<' = "&lt;"
    escape_char '>' = "&gt;"
    escape_char c   = [c]
389
390
-- | Convert a page to an epub and write it to the given handle. The
--   handle is flushed and then closed once the epub has been written.
epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  epub_bytes <- metadata obj >>= \epmd -> xhtml_to_epub epmd (to_xhtml obj)
  B.hPut handle epub_bytes
  mapM_ ($ handle) [hFlush, hClose]
399
-- | Read a string of XHTML with pandoc and write it back out as an
--   epub. The first argument is the epub metadata, the second is the
--   XHTML itself.
xhtml_to_epub :: String -> String -> IO B.ByteString
xhtml_to_epub epmd xhtml =
  writeEPUB Nothing [] epub_options parsed_document
  where
    epub_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    parsed_document = readHtml defaultParserState xhtml
407
408
409 --
410 -- Tests
411 --
412
-- | Check that 'preprocess' turns a link into a span, by comparing
--   the body of the preprocessed input against the body of the
--   expected document.
test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_body   <- first_body (parseHtml input_html >>> preprocess)
  expected_body <- first_body (parseHtml expected_html)
  assertEqual "Links are replaced with spans" expected_body actual_body
  where
    -- The first body element produced by the given document.
    first_body doc = fmap (!! 0) (runX (doc >>> css "body"))

    input_html    = "<body><a href=\"#\">Hello, world!</a></body>"
    expected_html = "<body><span>Hello, world!</span></body>"
430
431
-- | Check that 'preprocess' turns a site-relative image src into an
--   absolute one, by comparing the body of the preprocessed input
--   against the body of the expected document.
test_absolve_images :: Assertion
test_absolve_images = do
  actual_body   <- first_body (parseHtml input_html >>> preprocess)
  expected_body <- first_body (parseHtml expected_html)
  assertEqual "Image srcs are made absolute" expected_body actual_body
  where
    -- The first body element produced by the given document.
    first_body doc = fmap (!! 0) (runX (doc >>> css "body"))

    input_html =
      "<body>" ++
      "<img src=\"/images/2012/example.jpg\" />" ++
      "</body>"

    expected_html =
      "<body>" ++
      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
      "</body>"
455
456
-- | Check that 'preprocess' removes a link to an article's comments,
--   by comparing the body of the preprocessed input against the body
--   of the expected document.
test_comments_removed :: Assertion
test_comments_removed = do
  actual_body   <- first_body (parseHtml input_html >>> preprocess)
  expected_body <- first_body (parseHtml expected_html)
  assertEqual "Comment links are removed" expected_body actual_body
  where
    -- The first body element produced by the given document.
    first_body doc = fmap (!! 0) (runX (doc >>> css "body"))

    input_html =
      "<body><p>" ++
      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
      "</p></body>"

    expected_html = "<body><p></p></body>"
478
479
-- | Check that 'full_story_urls' picks out the "Full Story" links
--   (and not the comment links next to them) from a snippet of html.
test_full_story_urls_parsed :: Assertion
test_full_story_urls_parsed = do
  found_urls <- runX (parseHtml full_story_html >>> full_story_urls)
  assertEqual "Full Story URLs are parsed" expected_urls found_urls
  where
    expected_urls = ["/Articles/500738/", "/Articles/501837/"]

    full_story_html =
      "<p>" ++
      "<a href=\"/Articles/500738/\">Full Story</a> " ++
      "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)" ++
      "<p>" ++
      "<a href=\"/Articles/501837/\">Full Story</a> " ++
      "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)" ++
      "<p>"
502
-- | All of the tests in this module, collected into one group.
page_tests :: Test
page_tests =
  testGroup "Page Tests" (map (uncurry testCase) named_assertions)
  where
    named_assertions :: [(String, Assertion)]
    named_assertions =
      [ ("Links are replaced with spans", test_preprocess_links)
      , ("Image srcs are made absolute", test_absolve_images)
      , ("Comment links are removed", test_comments_removed)
      , ("Full Story URLs are parsed", test_full_story_urls_parsed)
      ]