]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Combine a few tests.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 module LWN.Page
2 where
3
4 import Text.Pandoc
5 import Data.Time (getCurrentTime)
6 import System.IO (Handle)
7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
8 import Data.String.Utils (split, strip)
9 import Data.Maybe (catMaybes, fromJust, isNothing)
10 import Data.Tree.NTree.TypeDefs (NTree)
11 import Test.HUnit (Assertion, assertEqual)
12 import Test.Framework (Test, testGroup)
13 import Test.Framework.Providers.HUnit (testCase)
14 import Text.XML.HXT.Core (
15 ArrowXml,
16 IOSArrow,
17 XmlTree,
18 XNode,
19 (>>>),
20 (/>),
21 (//>),
22 getChildren,
23 getText,
24 hasAttrValue,
25 hasName,
26 mkName,
27 none,
28 processAttrl,
29 processTopDown,
30 runX,
31 setElemName,
32 xshow,
33 when
34 )
35 import Text.HandsomeSoup (css, parseHtml)
36
37 import LWN.Article
38 import XHTML
39
40 data Page =
41 -- | An LWN page with one article on it.
42 ArticlePage { article :: Article } |
43
44 -- | An LWN page with more than one article on it. These require
45 -- different parsing and display functions than the single-article
46 -- pages.
47 FullPage { headline :: String,
48 articles :: [Article] }
49
50
51 instance XHTML Page where
52 to_xhtml (ArticlePage a) =
53 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
54 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
55 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
56 "<head>" ++
57 " <meta http-equiv=\"Content-Type\"" ++
58 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
59 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
60 "</head>" ++
61 "<body>" ++
62 "<div>" ++
63 (to_xhtml a) ++
64 "</div>" ++
65 "</body>" ++
66 "</html>"
67
68 to_xhtml (FullPage hl as) =
69 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
70 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
71 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
72 "<head>" ++
73 " <meta http-equiv=\"Content-Type\"" ++
74 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
75 " <title>" ++ hl ++ "</title>" ++
76 "</head>" ++
77 "<body>" ++
78 "<div>" ++
79 "<h1>" ++ hl ++ "</h1>" ++
80 (concatMap to_xhtml as) ++
81 "</div>" ++
82 "</body>" ++
83 "</html>"
84
85
86
87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
88 remove_images =
89 processTopDown ((none) `when` is_image)
90
91
92 is_link :: (ArrowXml a) => a XmlTree XmlTree
93 is_link =
94 hasName "a"
95
96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
97 replace_links_with_spans =
98 processTopDown $ (make_span >>> remove_attrs) `when` is_link
99 where
100 make_span = setElemName $ mkName "span"
101 remove_attrs = processAttrl none
102
103 -- | Preprocessing common to both page types.
104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
105 preprocess xml =
106 xml >>> remove_images >>> replace_links_with_spans
107
108
109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
110 parse xml = do
111 let clean_xml = preprocess xml
112 appr <- ap_parse clean_xml
113 fppr <- fp_parse clean_xml
114 return $
115 if (isNothing appr) then
116 fppr
117 else
118 appr
119
120
121
122 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
123 parse_headline xml = do
124 let element_filter = xml >>> css "div.PageHeadline h1"
125 let element_text_filter = element_filter /> getText
126 element_text <- runX element_text_filter
127 return $
128 case element_text of
129 [x] -> Just $ strip x
130 [] -> Nothing
131 _ -> error "Found more than one headline."
132
133
134 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
135 parse_byline xml = do
136 let element_filter = xml >>> css "div.FeatureByLine"
137 let element_text_filter = element_filter /> getText
138 element_text <- runX element_text_filter
139 return $
140 case element_text of
141 [x] -> Just $ strip x
142 [] -> Nothing
143 _ -> error "Found more than one article byline."
144
145
146 --
147 -- ArticlePage Stuff
148 --
149 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
150 ap_parse xml = do
151 arts <- ap_parse_articles xml
152 case arts of
153 Just [x] -> return $ Just $ ArticlePage x
154 _ -> return Nothing
155
156
157 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
158 ap_parse_body xml = do
159 let element_filter = xml >>> css "div.ArticleText"
160 let element_html_filter = xshow element_filter
161 element_html <- runX element_html_filter
162 return $ case element_html of
163 [x] -> Just x
164 [] -> Nothing
165 _ -> error "Found more than one article."
166
167
168 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
169 ap_parse_articles xml = do
170 parsed_headline <- parse_headline xml
171 parsed_byline <- parse_byline xml
172 parsed_body <- ap_parse_body xml
173 let title' = Title (fromJust parsed_headline)
174 let byline' = Byline parsed_byline
175 let body' = BodyHtml (fromJust parsed_body)
176 return $ Just $ [Article title' byline' body']
177
178
179
180
181 --
182 -- FullPage Stuff
183 --
184
185 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
186 fp_parse xml = do
187 hl <- parse_headline xml
188 parsed_articles <- fp_parse_articles xml
189 case parsed_articles of
190 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
191 _ -> return Nothing
192
193
194
195 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
196 fp_parse_article_title xml = do
197 let element_filter = xml >>> css "h2.SummaryHL"
198 let element_text_filter = element_filter //> getText
199 element_text <- runX element_text_filter
200 return $ case element_text of
201 [x] -> Just $ strip x
202 [] -> Nothing
203 _ -> error "Found more than one article title."
204
205
206
207 is_title :: (ArrowXml a) => a XmlTree XmlTree
208 is_title =
209 (hasName "h2")
210 >>>
211 (hasAttrValue "class" (== "SummaryHL"))
212
213
214 is_byline :: (ArrowXml a) => a XmlTree XmlTree
215 is_byline =
216 (hasName "div")
217 >>>
218 (hasAttrValue "class" (== "FeatureByLine"))
219
220
221 is_image :: (ArrowXml a) => a XmlTree XmlTree
222 is_image =
223 hasName "img"
224
225 remove_title :: (ArrowXml a) => a XmlTree XmlTree
226 remove_title =
227 processTopDown ((none) `when` is_title)
228
229
230 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
231 remove_byline =
232 processTopDown ((none) `when` is_byline)
233
234
235
236 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
237 fp_parse_article_body xml = do
238 -- First, delete the article title and byline.
239 let clean_xml' = xml >>> remove_title >>> remove_byline
240 -- The only child of the body element should be a div.lwn-article
241 -- since we wrapped the article's HTML in that.
242 let clean_xml = clean_xml' >>> css "body" >>> getChildren
243 clean_html <- runX . xshow $ clean_xml
244 return $ case clean_html of
245 [x] -> Just x
246 [] -> Nothing
247 _ -> error "Found more than one article body."
248
249 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
250 fp_parse_article xml = do
251 parsed_article_title <- fp_parse_article_title xml
252 parsed_article_byline <- parse_byline xml
253 parsed_article_body <- fp_parse_article_body xml
254 let title' = Title $ fromJust parsed_article_title
255 let byline' = Byline parsed_article_byline
256 let body' = BodyHtml $ fromJust parsed_article_body
257 return $ Just $ Article title' byline' body'
258
259 parse_html_article :: String -> IO (Maybe Article)
260 parse_html_article html = do
261 let xml = parseHtml $ wrap_in_body_div html
262 fp_parse_article xml
263
264
265 -- | In the full page, all of the article titles and bodies are
266 -- wrapped in one big div.ArticleText.
267 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
268 parse_bodies xml =
269 xml >>> css "div.ArticleText"
270
271
272 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
273 fp_parse_articles xml = do
274 bodies <- runX . xshow $ parse_bodies xml
275 let article_separator = "<h2 class=\"SummaryHL\">"
276 let split_articles'' = split article_separator (concat bodies)
277 -- The first element will contain the crap before the first <h2...>.
278 let split_articles' = tail split_articles''
279 -- Put the separator back, it was lost during the split.
280 let split_articles = map (article_separator ++) split_articles'
281 --_ <- mapM print_article split_articles
282 real_articles <- mapM parse_html_article split_articles
283 let just_articles = catMaybes real_articles
284 return just_articles
285
286
287 -- | This makes it easy to select otherwise-random chunks of html
288 -- using 'css'.
289 wrap_in_body_div :: String -> String
290 wrap_in_body_div s =
291 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
292
293
294
295
296 --
297 -- Epublishable stuff
298 --
299
300 title :: Page -> String
301 title (ArticlePage a) = getTitle $ LWN.Article.title a
302 title (FullPage hl _) = hl
303
304
305 metadata :: Page -> IO String
306 metadata obj = do
307 date <- getCurrentTime
308 return $
309 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
310 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
311 "<dc:language>en-US</dc:language>\n" ++
312 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
313 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
314
315
316 epublish :: Page -> Handle -> IO ()
317 epublish obj handle = do
318 let xhtml = to_xhtml obj
319 epmd <- metadata obj
320 epub <- xhtml_to_epub epmd xhtml
321 B.hPut handle epub
322
323
324 xhtml_to_epub :: String -> String -> IO B.ByteString
325 xhtml_to_epub epmd =
326 write_epub . read_html
327 where
328 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
329 write_epub = writeEPUB Nothing [] my_writer_options
330 read_html = readHtml defaultParserState
331
332
333
334
335
336 --
337 -- Tests
338 --
339
340 test_preprocess_links :: Assertion
341 test_preprocess_links = do
342 actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
343 let actual_xml = actual_xml' !! 0
344
345 expected_xml' <- runX $ expected_xml'' >>> css "body"
346 let expected_xml = expected_xml' !! 0
347
348 assertEqual
349 "Links are replaced with spans"
350 expected_xml
351 actual_xml
352 where
353 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
354 input_xml = parseHtml input_html
355 expected_html = "<body><span>Hello, world!</span></body>"
356 expected_xml'' = parseHtml expected_html
357
358 page_tests :: Test
359 page_tests =
360 testGroup "Page Tests" [
361 testCase "Links are replaced with spans" test_preprocess_links ]