Go through a good bit of nonsense to get it successfully parsing our three test cases.
[dead/lwn-epub.git] / src / LWN / FullPage.hs
1 module LWN.FullPage
2 where
3
4 import Data.String.Utils (split, strip)
5 import Data.Maybe (catMaybes, fromJust)
6 import Data.Tree.NTree.TypeDefs (NTree)
7 import Text.XML.HXT.Core (
8 ArrowXml,
9 IOSArrow,
10 XmlTree,
11 XNode,
12 (>>>),
13 (/>),
14 (//>),
15 getChildren,
16 getText,
17 hasAttrValue,
18 hasName,
19 none,
20 processTopDown,
21 runX,
22 xshow,
23 when
24 )
25 import Text.HandsomeSoup (css, parseHtml)
26
27 import Epublishable
28 import LWN.Article
29 import XHTML
30
31 -- | An LWN page with more than one article on it. These require
32 -- different parsing and display functions than the single-article
33 -- pages.
34 data FullPage = FullPage { headline :: String,
35 articles :: [Article] }
36
37 articles_xhtml :: FullPage -> String
38 articles_xhtml fp = concatMap to_xhtml (articles fp)
39
40 instance XHTML FullPage where
41 to_xhtml fp =
42 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
43 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
44 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
45 "<head>" ++
46 " <meta http-equiv=\"Content-Type\"" ++
47 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
48 " <title>" ++ (headline fp) ++ "</title>" ++
49 "</head>" ++
50 "<body>" ++
51 "<div>" ++
52 "<h1>" ++ (headline fp) ++ "</h1>" ++
53 (articles_xhtml fp) ++
54 "</div>" ++
55 "</body>" ++
56 "</html>"
57
58 instance Epublishable FullPage where
59 parse xml = do
60 hl <- parse_headline xml
61 parsed_articles <- parse_articles xml
62 case parsed_articles of
63 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
64 _ -> return Nothing
65
66 title = headline
67
68
69 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
70 parse_headline xml = do
71 let element_filter = xml >>> css "div.PageHeadline h1"
72 let element_text_filter = element_filter /> getText
73 element_text <- runX element_text_filter
74 return $ case element_text of
75 [x] -> Just $ strip x
76 [] -> Nothing
77 _ -> error "Found more than one headline."
78
79 parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
80 parse_article_byline xml = do
81 let element_filter = xml >>> css "div.FeatureByLine"
82 let element_text_filter = element_filter /> getText
83 element_text <- runX element_text_filter
84 return $ case element_text of
85 [x] -> Just $ strip x
86 [] -> Nothing
87 _ -> error "Found more than one article byline."
88
89
90 parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
91 parse_article_title xml = do
92 let element_filter = xml >>> css "h2.SummaryHL"
93 let element_text_filter = element_filter //> getText
94 element_text <- runX element_text_filter
95 return $ case element_text of
96 [x] -> Just $ strip x
97 [] -> Nothing
98 _ -> error "Found more than one article title."
99
100
101
102 is_title :: (ArrowXml a) => a XmlTree XmlTree
103 is_title =
104 (hasName "h2")
105 >>>
106 (hasAttrValue "class" (== "SummaryHL"))
107
108
109 is_byline :: (ArrowXml a) => a XmlTree XmlTree
110 is_byline =
111 (hasName "div")
112 >>>
113 (hasAttrValue "class" (== "FeatureByLine"))
114
115
116 is_image :: (ArrowXml a) => a XmlTree XmlTree
117 is_image =
118 hasName "img"
119
120
121 remove_title :: (ArrowXml a) => a XmlTree XmlTree
122 remove_title =
123 processTopDown ((none) `when` is_title)
124
125
126 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
127 remove_byline =
128 processTopDown ((none) `when` is_byline)
129
130
131 remove_images :: (ArrowXml a) => a XmlTree XmlTree
132 remove_images =
133 processTopDown ((none) `when` is_image)
134
135
136
137 parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
138 parse_article_body xml = do
139 -- First, delete the article title and byline.
140 let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
141 -- The only child of the body element should be a div.lwn-article
142 -- since we wrapped the article's HTML in that.
143 let clean_xml = clean_xml' >>> css "body" >>> getChildren
144 clean_html <- runX . xshow $ clean_xml
145 return $ case clean_html of
146 [x] -> Just x
147 [] -> Nothing
148 _ -> error "Found more than one article body."
149
150 parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
151 parse_article xml = do
152 parsed_article_title <- parse_article_title xml
153 parsed_article_byline <- parse_article_byline xml
154 parsed_article_body <- parse_article_body xml
155 let title' = Title $ fromJust parsed_article_title
156 let byline' = Byline parsed_article_byline
157 let body' = BodyHtml $ fromJust parsed_article_body
158 return $ Just $ Article title' byline' body'
159
160 parse_html_article :: String -> IO (Maybe Article)
161 parse_html_article html = do
162 let xml = parseHtml $ wrap_in_body_div html
163 parse_article xml
164
165
166 -- | In the full page, all of the article titles and bodies are
167 -- wrapped in a div.ArticleText.
168 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
169 parse_bodies xml =
170 xml >>> css "div.ArticleText"
171
172
173 -- Debug, print a string.
174 print_article :: String -> IO ()
175 print_article s = do
176 putStrLn "-----------"
177 putStrLn "- Article -"
178 putStrLn "-----------"
179 putStrLn ""
180 putStrLn s
181 putStrLn ""
182
183
184 -- Debug, print an article's body html.
185 print_body :: Article -> IO ()
186 print_body x =
187 print_article bh
188 where
189 bh' = body_html x
190 bh = getBodyHtml bh'
191
192
193 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
194 parse_articles xml = do
195 bodies <- runX . xshow $ parse_bodies xml
196 let article_separator = "<h2 class=\"SummaryHL\">"
197 let split_articles'' = split article_separator (concat bodies)
198 -- The first element will contain the crap before the first <h2...>.
199 let split_articles' = tail split_articles''
200 -- Put the separator back, it was lost during the split.
201 let split_articles = map (article_separator ++) split_articles'
202 --_ <- mapM print_article split_articles
203 real_articles <- mapM parse_html_article split_articles
204 let just_articles = catMaybes real_articles
205 return just_articles
206
207
208 wrap_in_body_div :: String -> String
209 wrap_in_body_div s =
210 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"