]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/XHTML.hs
Replace double newlines with "<br /><br />" in <pre> tags.
[dead/lwn-epub.git] / src / LWN / XHTML.hs
1 module LWN.XHTML (
2 XHTML,
3 XML,
4 full_story_urls,
5 image_srcs,
6 full_story_link,
7 full_story_paragraph,
8 is_image,
9 parse_lwn,
10 preprocess,
11 remove_byline,
12 remove_full_story_paragraphs,
13 remove_title,
14 to_xhtml,
15 to_xml,
16 xml_from_contents)
17 where
18
19 import Data.String.Utils (replace)
20 import Text.HandsomeSoup (css)
21 import Text.XML.HXT.Core (
22 (>>>),
23 (/>),
24 ArrowXml,
25 IOStateArrow,
26 SysConfigList,
27 XmlTree,
28 changeAttrValue,
29 changeText,
30 deep,
31 getAttrValue,
32 hasAttrValue,
33 hasName,
34 hasText,
35 ifA,
36 isElem,
37 mkName,
38 no,
39 none,
40 processAttrl,
41 processTopDown,
42 readString,
43 setElemName,
44 this,
45 when,
46 withParseHTML,
47 withValidate,
48 withWarnings,
49 yes)
50
51
52 import LWN.URI (URL, try_make_absolute_url)
53 import Misc (contains)
54
55
56 class XHTML a where
57 to_xhtml :: a -> String
58
59 class XML a where
60 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
61
62 -- | Options used when parsing HTML.
63 my_read_opts :: SysConfigList
64 my_read_opts = [ withValidate no,
65 withParseHTML yes,
66 withWarnings no ]
67
68 -- | My version of HandsomeSoup's parseHTML.
69 parse_lwn :: String -> IOStateArrow s b XmlTree
70 parse_lwn = readString my_read_opts
71
72
73 -- | Takes the result of get_article_contents and calls parse_lwn on
74 -- the contained value.
75 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
76 xml_from_contents =
77 fmap parse_lwn
78
79
80
81 -- | Preprocessing common to both page types.
82 preprocess :: (ArrowXml a) => a XmlTree XmlTree
83 preprocess =
84 make_image_srcs_absolute
85 >>>
86 remove_comment_links
87 >>>
88 replace_links_with_spans
89 >>>
90 replace_double_newline_in_pre
91
92
93 is_link :: (ArrowXml a) => a XmlTree XmlTree
94 is_link =
95 isElem >>> hasName "a"
96
97 is_pre :: (ArrowXml a) => a XmlTree XmlTree
98 is_pre =
99 isElem
100 >>>
101 hasName "pre"
102
103 replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree
104 replace_double_newline_in_pre =
105 processTopDown $ fix_it `when` is_pre
106 where
107 fix_it :: (ArrowXml a) => a XmlTree XmlTree
108 fix_it =
109 changeText $ replace "\n\n" "<br /><br />"
110
111 remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
112 remove_full_story_paragraphs =
113 processTopDown $ none `when` full_story_paragraph
114
115
116 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
117 remove_comment_links =
118 processTopDown $ kill_comments `when` is_link
119 where
120 is_comment_link =
121 hasAttrValue "href" (contains "#Comments")
122
123 kill_comments =
124 none `when` is_comment_link
125
126 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
127 replace_links_with_spans =
128 processTopDown $ (make_span >>> remove_attrs) `when` is_link
129 where
130 make_span = setElemName $ mkName "span"
131 remove_attrs = processAttrl none
132
133
134 is_title :: (ArrowXml a) => a XmlTree XmlTree
135 is_title =
136 (hasName "h2")
137 >>>
138 (hasAttrValue "class" (== "SummaryHL"))
139
140
141 is_byline :: (ArrowXml a) => a XmlTree XmlTree
142 is_byline =
143 (hasName "div")
144 >>>
145 (hasAttrValue "class" (== "FeatureByLine"))
146
147
148 is_image :: (ArrowXml a) => a XmlTree XmlTree
149 is_image = isElem >>> hasName "img"
150
151 remove_title :: (ArrowXml a) => a XmlTree XmlTree
152 remove_title =
153 processTopDown ((none) `when` is_title)
154
155
156 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
157 remove_byline =
158 processTopDown ((none) `when` is_byline)
159
160
161 image_srcs :: (ArrowXml a) => a XmlTree URL
162 image_srcs =
163 css "img"
164 >>>
165 getAttrValue "src"
166
167
168 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
169 full_story_paragraph =
170 isElem
171 >>>
172 hasName "p"
173 >>>
174 ifA
175 (this /> full_story_link)
176 this
177 none
178
179
180 -- Without regard to the parent paragraph.
181 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
182 full_story_link =
183 isElem
184 >>>
185 hasName "a"
186 >>>
187 ifA
188 (this /> hasText (== "Full Story"))
189 this
190 none
191
192
193 -- | Get the hrefs of all full story links.
194 full_story_urls :: (ArrowXml a) => a XmlTree URL
195 full_story_urls =
196 deep $
197 full_story_paragraph
198 />
199 full_story_link
200 >>>
201 getAttrValue "href"
202
203
204
205 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
206 make_image_srcs_absolute =
207 processTopDown (make_srcs_absolute `when` is_image)
208 where
209 change_src :: (ArrowXml a) => a XmlTree XmlTree
210 change_src =
211 changeAttrValue try_make_absolute_url
212
213 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
214 make_srcs_absolute =
215 processAttrl $ change_src `when` hasName "src"