{-# LANGUAGE DoAndIfThenElse #-}

module LWN.Page
where

import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
import qualified Data.Map as Map (Map, empty, insert, lookup)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.IO (Handle, hClose, hFlush)
import Test.HUnit (Assertion, assertEqual)
import Test.Framework (Test, testGroup)
import Test.Framework.Providers.HUnit (testCase)
import Text.Pandoc (
  defaultParserState,
  defaultWriterOptions,
  readHtml,
  writeEPUB,
  writerEPUBMetadata)
import Text.XML.HXT.Core (
  ArrowXml,
  IOSArrow,
  XmlTree,
  ($<),
  (>>>),
  (/>),
  (//>),
  changeAttrValue,
  getAttrValue,
  getChildren,
  getText,
  hasName,
  processAttrl,
  processTopDown,
  this,
  runX,
  xshow,
  when)
import Text.HandsomeSoup (css, parseHtml)

import Configuration (Cfg)
import LWN.Article
import LWN.HTTP (
  ImageMap,
  download_image_urls,
  get_article_contents)
import LWN.URI (URL)
import LWN.XHTML (
  XHTML,
  full_story_urls,
  image_srcs,
  full_story_link,
  full_story_paragraph,
  is_image,
  preprocess,
  remove_byline,
  remove_title,
  to_xhtml,
  to_xml,
  xml_from_contents)


data Page =
  -- | An LWN page with one article on it.
  ArticlePage { article :: Article } |

  -- | An LWN page with more than one article on it. These require
  -- different parsing and display functions than the single-article
  -- pages.
  FullPage { headline :: String,
             articles :: [Article] }

instance XHTML Page where
  to_xhtml (ArticlePage a) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html xmlns=\"http://www.w3.org/1999/xhtml\">" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ (getTitle $ LWN.Article.title a) ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    (to_xhtml a) ++
    "</body>" ++
    "</html>"

  to_xhtml (FullPage hl as) =
    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
    " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
    "<html xmlns=\"http://www.w3.org/1999/xhtml\">" ++
    "<head>" ++
    " <meta http-equiv=\"Content-Type\"" ++
    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
    " <title>" ++ hl ++ "</title>" ++
    "</head>" ++
    "<body>" ++
    "<div>" ++
    "<h1>" ++ hl ++ "</h1>" ++
    (concatMap to_xhtml as) ++
    "</div>" ++
    "</body>" ++
    "</html>"


page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
  contents <- get_article_contents cfg url
  case (xml_from_contents contents) of
    Just html -> parse cfg html
    Nothing -> return Nothing

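-- An illustrative usage sketch, not part of the original module: it
-- assumes a 'Cfg' value is available and that 'withFile' and 'WriteMode'
-- are imported from System.IO; the URL and output file name are made up
-- for illustration. It shows roughly how 'page_from_url' combines with
-- 'epublish' (defined further down):
--
--   convert_current_edition :: Cfg -> IO ()
--   convert_current_edition cfg = do
--     result <- page_from_url cfg "https://lwn.net/current/bigpage"
--     case result of
--       Just page -> withFile "lwn.epub" WriteMode (epublish page)
--       Nothing   -> putStrLn "Error: could not parse the page."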


insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
insert_full_stories story_map =
  processTopDown (article_xml `when` full_story_paragraph)
  where
    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
    lookup_func href =
      case Map.lookup href story_map of
        -- Leave it alone if we don't have the full story.
        Nothing -> this
        Just v -> to_xml v

    article_xml :: (ArrowXml a) => a XmlTree XmlTree
    article_xml =
      lookup_func
      $<
      (this /> full_story_link >>> getAttrValue "href")

replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
  processTopDown (make_srcs_local `when` is_image)
  where
    -- old_src -> new_src
    change_src_func :: String -> String
    change_src_func old_src =
      case Map.lookup old_src image_map of
        -- Leave it alone if we don't have the file locally
        Nothing -> old_src
        Just v -> v

    change_src :: (ArrowXml a) => a XmlTree XmlTree
    change_src =
      changeAttrValue change_src_func

    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
    make_srcs_local =
      processAttrl $ (change_src `when` (hasName "src"))

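-- An illustrative sketch, not taken from the original source: if the
-- ImageMap maps the remote URL
--
--   "https://lwn.net/images/2012/example.jpg"
--
-- to a hypothetical local path such as "/tmp/lwn-epub/example.jpg", then
-- every <img> whose src is that URL is rewritten to point at the local
-- copy, so the EPUB can bundle the file instead of linking to the web.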


-- Should be called *after* preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml = do
  image_urls <- runX $ xml >>> image_srcs
  download_image_urls image_urls


type StoryMap = Map.Map URL Article

-- These come *before* preprocessing.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
  pages <- mapM (page_from_url cfg) story_urls
  let pairs = zip story_urls pages
  return $ foldl my_insert empty_map pairs
  where
    empty_map = Map.empty :: StoryMap

    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
    my_insert dict (_, _) = dict


download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
download_full_stories cfg xml = do
  story_urls <- runX $ xml >>> full_story_urls
  download_full_story_urls cfg story_urls


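-- | Parse a page: first splice in any downloaded "Full Story"
-- articles, then preprocess the XML, download the images it references
-- and point the img srcs at the local copies, and finally try to
-- interpret the result as a single-article page before falling back to
-- a full (multi-article) page.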
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
  story_map <- download_full_stories cfg xml
  let fs_xml = xml >>> insert_full_stories story_map

  let clean_xml = fs_xml >>> preprocess
  image_map <- download_images clean_xml
  let local_xml = clean_xml >>> replace_remote_img_srcs image_map

  appr <- ap_parse local_xml
  fppr <- fp_parse local_xml
  return $
    if (isNothing appr) then
      fppr
    else
      appr


parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_headline xml = do
  let element_filter = xml >>> css "div.PageHeadline h1"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one headline."


parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
parse_byline xml = do
  let element_filter = xml >>> css "div.FeatureByLine"
  let element_text_filter = element_filter /> getText
  element_text <- runX element_text_filter
  return $
    case element_text of
      [x] -> Just $ strip x
      []  -> Nothing
      _   -> error "Found more than one article byline."


--
-- ArticlePage Stuff
--
ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
ap_parse xml = do
  arts <- ap_parse_articles xml
  case arts of
    [x] -> return $ Just $ ArticlePage x
    _   -> return Nothing


ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
ap_parse_body xml = do
  let element_filter = xml >>> css "div.ArticleText"
  let element_html_filter = xshow element_filter
  element_html <- runX element_html_filter
  return $ case element_html of
    [x] -> Just x
    []  -> Nothing
    _   -> error "Found more than one article."


ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
ap_parse_articles xml = do
  parsed_headline <- parse_headline xml
  parsed_byline <- parse_byline xml
  parsed_body <- ap_parse_body xml

  -- Debugging output; only print the headline if we actually found one.
  maybe (return ()) putStrLn parsed_headline

  if (isNothing parsed_headline) || (isNothing parsed_body)
  then return []
  else do
    let title' = Title $ fromJust parsed_headline
    let byline' = Byline parsed_byline
    let body' = BodyHtml $ fromJust parsed_body

    return $ [Article title' byline' body']


--
-- FullPage Stuff
--

fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
fp_parse xml = do
  hl <- parse_headline xml
  parsed_articles <- fp_parse_articles xml
  case parsed_articles of
    [] -> return Nothing
    x  -> return $ Just $ FullPage (fromJust hl) x



fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_title xml = do
  let element_filter = xml >>> css "h2.SummaryHL"
  let element_text_filter = element_filter //> getText
  element_text <- runX element_text_filter
  return $ case element_text of
    [x] -> Just $ strip x
    []  -> Nothing
    _   -> error "Found more than one article title."



fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
fp_parse_article_body xml = do
  -- First, delete the article title and byline.
  let clean_xml' = xml >>> remove_title >>> remove_byline
  -- The only child of the body element should be a div.lwn-article
  -- since we wrapped the article's HTML in that.
  let clean_xml = clean_xml' >>> css "body" >>> getChildren
  clean_html <- runX . xshow $ clean_xml
  return $ case clean_html of
    [x] -> Just x
    []  -> Nothing
    _   -> error "Found more than one article body."

fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
fp_parse_article xml = do
  parsed_article_title <- fp_parse_article_title xml
  parsed_article_byline <- parse_byline xml
  parsed_article_body <- fp_parse_article_body xml

  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
  then
    return Nothing
  else do
    let title' = Title $ fromJust parsed_article_title
    let byline' = Byline parsed_article_byline
    let body' = BodyHtml $ fromJust parsed_article_body
    return $ Just $ Article title' byline' body'

parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
  let xml = parseHtml $ wrap_in_body_div html
  fp_parse_article xml


-- | In the full page, all of the article titles and bodies are
-- wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
parse_bodies xml =
  xml >>> css "div.ArticleText"


fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
fp_parse_articles xml = do
  bodies <- runX . xshow $ parse_bodies xml
  let article_separator = "<h2 class=\"SummaryHL\">"
  let split_articles'' = split article_separator (concat bodies)
  -- The first element will contain the crap before the first <h2...>.
  let split_articles' = case split_articles'' of
                          (_:_) -> tail split_articles''
                          []    -> []
  -- Put the separator back; it was lost during the split.
  let split_articles = map (article_separator ++) split_articles'
  --_ <- mapM print_article split_articles
  real_articles <- mapM parse_html_article split_articles
  let just_articles = catMaybes real_articles
  return just_articles

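-- An illustrative sketch, not from the original source, of how the
-- splitting above behaves. Given a (made-up) ArticleText body like
--
--   <junk/><h2 class="SummaryHL">First</h2>...<h2 class="SummaryHL">Second</h2>...
--
-- splitting on the separator yields ["<junk/>", "First</h2>...",
-- "Second</h2>..."]; the leading junk is dropped and the separator is
-- prepended again, leaving one self-contained HTML chunk per article
-- headline.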

-- | This makes it easy to select otherwise-random chunks of html
-- using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
  "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"

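-- A small usage sketch, not from the original source; the result shown
-- is what HXT/HandsomeSoup should produce for this input, assuming the
-- fragment parses cleanly:
--
--   >>> runX $ parseHtml (wrap_in_body_div "<p>hi</p>") >>> css "div.lwn-article" //> getText
--   ["hi"]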


--
-- Epublishable stuff
--

title :: Page -> String
title (ArticlePage a) = getTitle $ LWN.Article.title a
title (FullPage hl _) = hl


metadata :: Page -> IO String
metadata obj = do
  date <- getCurrentTime
  return $
    "<dc:creator>http://lwn.net/</dc:creator>\n" ++
    "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
    "<dc:language>en-US</dc:language>\n" ++
    "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
    "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"


epublish :: Page -> Handle -> IO ()
epublish obj handle = do
  let xhtml = to_xhtml obj
  epmd <- metadata obj
  epub <- xhtml_to_epub epmd xhtml
  B.hPut handle epub
  hFlush handle
  hClose handle

xhtml_to_epub :: String -> String -> IO B.ByteString
xhtml_to_epub epmd =
  write_epub . read_html
  where
    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
    write_epub = writeEPUB Nothing [] my_writer_options
    read_html = readHtml defaultParserState


--
-- Tests
--

test_preprocess_links :: Assertion
test_preprocess_links = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Links are replaced with spans"
    expected_xml
    actual_xml
  where
    input_html = "<body><a href=\"#\">Hello, world!</a></body>"
    input_xml = parseHtml input_html
    expected_html = "<body><span>Hello, world!</span></body>"
    expected_xml'' = parseHtml expected_html


test_absolve_images :: Assertion
test_absolve_images = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Image srcs are made absolute"
    expected_xml
    actual_xml
  where
    input_html =
      "<body>" ++
      "<img src=\"/images/2012/example.jpg\" />" ++
      "</body>"
    input_xml = parseHtml input_html
    expected_html =
      "<body>" ++
      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
      "</body>"
    expected_xml'' = parseHtml expected_html


test_comments_removed :: Assertion
test_comments_removed = do
  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
  let actual_xml = actual_xml' !! 0

  expected_xml' <- runX $ expected_xml'' >>> css "body"
  let expected_xml = expected_xml' !! 0

  assertEqual
    "Comment links are removed"
    expected_xml
    actual_xml
  where
    input_html =
      "<body><p>" ++
      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
      "</p></body>"
    input_xml = parseHtml input_html

    expected_html = "<body><p></p></body>"
    expected_xml'' = parseHtml expected_html


test_full_story_urls_parsed :: Assertion
test_full_story_urls_parsed = do
  actual <- runX $ actual'

  assertEqual
    "Full Story URLs are parsed"
    expected
    actual
  where
    expected = ["/Articles/500738/", "/Articles/501837/"]

    full_story_html =
      concat ["<p>",
              "<a href=\"/Articles/500738/\">Full Story</a> ",
              "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
              "<p>",
              "<a href=\"/Articles/501837/\">Full Story</a> ",
              "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
              "<p>"]

    xml = parseHtml full_story_html
    actual' = xml >>> full_story_urls


page_tests :: Test
page_tests =
  testGroup "Page Tests" [
    testCase "Links are replaced with spans" test_preprocess_links,
    testCase "Image srcs are made absolute" test_absolve_images,
    testCase "Comment links are removed" test_comments_removed,
    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]
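
-- A hedged sketch, not part of this module, of how 'page_tests' might be
-- run from a separate test driver using test-framework's 'defaultMain'
-- (the driver module itself is made up):
--
--   module Main (main) where
--
--   import Test.Framework (defaultMain)
--   import LWN.Page (page_tests)
--
--   main :: IO ()
--   main = defaultMain [page_tests]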