]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/XHTML.hs
5d3bbd17327a8eaf20bec29152ed30de9aa9c7df
[dead/lwn-epub.git] / src / LWN / XHTML.hs
1 module LWN.XHTML (
2 XHTML,
3 XML,
4 full_story_urls,
5 image_srcs,
6 full_story_link,
7 full_story_paragraph,
8 is_image,
9 parse_lwn,
10 preprocess,
11 remove_byline,
12 remove_title,
13 to_xhtml,
14 to_xml,
15 xml_from_contents)
16 where
17
18 import Text.HandsomeSoup (css)
19 import Text.XML.HXT.Core (
20 (>>>),
21 (/>),
22 ArrowXml,
23 IOStateArrow,
24 SysConfigList,
25 XmlTree,
26 changeAttrValue,
27 deep,
28 getAttrValue,
29 hasAttrValue,
30 hasName,
31 hasText,
32 ifA,
33 isElem,
34 mkName,
35 no,
36 none,
37 processAttrl,
38 processTopDown,
39 readString,
40 setElemName,
41 this,
42 when,
43 withParseHTML,
44 withValidate,
45 withWarnings,
46 yes)
47
48
49 import LWN.URI (URL, try_make_absolute_url)
50 import Misc (contains)
51
52
53 class XHTML a where
54 to_xhtml :: a -> String
55
56 class XML a where
57 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
58
59 -- | Options used when parsing HTML.
60 my_read_opts :: SysConfigList
61 my_read_opts = [ withValidate no,
62 withParseHTML yes,
63 withWarnings no ]
64
65 -- | My version of HandsomeSoup's parseHTML.
66 parse_lwn :: String -> IOStateArrow s b XmlTree
67 parse_lwn = readString my_read_opts
68
69
70 -- | Takes the result of get_article_contents and calls parse_lwn on
71 -- the contained value.
72 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
73 xml_from_contents =
74 fmap parse_lwn
75
76
77
78 -- | Preprocessing common to both page types.
79 preprocess :: (ArrowXml a) => a XmlTree XmlTree
80 preprocess =
81 make_image_srcs_absolute
82 >>>
83 remove_comment_links
84 >>>
85 replace_links_with_spans
86
87
88 is_link :: (ArrowXml a) => a XmlTree XmlTree
89 is_link =
90 isElem >>> hasName "a"
91
92
93 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
94 remove_comment_links =
95 processTopDown $ kill_comments `when` is_link
96 where
97 is_comment_link =
98 hasAttrValue "href" (contains "#Comments")
99
100 kill_comments =
101 none `when` is_comment_link
102
103 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
104 replace_links_with_spans =
105 processTopDown $ (make_span >>> remove_attrs) `when` is_link
106 where
107 make_span = setElemName $ mkName "span"
108 remove_attrs = processAttrl none
109
110
111 is_title :: (ArrowXml a) => a XmlTree XmlTree
112 is_title =
113 (hasName "h2")
114 >>>
115 (hasAttrValue "class" (== "SummaryHL"))
116
117
118 is_byline :: (ArrowXml a) => a XmlTree XmlTree
119 is_byline =
120 (hasName "div")
121 >>>
122 (hasAttrValue "class" (== "FeatureByLine"))
123
124
125 is_image :: (ArrowXml a) => a XmlTree XmlTree
126 is_image = isElem >>> hasName "img"
127
128 remove_title :: (ArrowXml a) => a XmlTree XmlTree
129 remove_title =
130 processTopDown ((none) `when` is_title)
131
132
133 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
134 remove_byline =
135 processTopDown ((none) `when` is_byline)
136
137
138 image_srcs :: (ArrowXml a) => a XmlTree URL
139 image_srcs =
140 css "img"
141 >>>
142 getAttrValue "src"
143
144
145 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
146 full_story_paragraph =
147 isElem
148 >>>
149 hasName "p"
150 >>>
151 ifA
152 (this /> full_story_link)
153 this
154 none
155
156
157 -- Without regard to the parent paragraph.
158 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
159 full_story_link =
160 isElem
161 >>>
162 hasName "a"
163 >>>
164 ifA
165 (this /> hasText (== "Full Story"))
166 this
167 none
168
169
170 -- | Get the hrefs of all full story links.
171 full_story_urls :: (ArrowXml a) => a XmlTree URL
172 full_story_urls =
173 deep $
174 full_story_paragraph
175 />
176 full_story_link
177 >>>
178 getAttrValue "href"
179
180
181
182 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
183 make_image_srcs_absolute =
184 processTopDown (make_srcs_absolute `when` is_image)
185 where
186 change_src :: (ArrowXml a) => a XmlTree XmlTree
187 change_src =
188 changeAttrValue try_make_absolute_url
189
190 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
191 make_srcs_absolute =
192 processAttrl $ change_src `when` hasName "src"