12 remove_full_story_paragraphs,
19 import Data.String.Utils (replace)
20 import Text.HandsomeSoup (css)
21 import Text.XML.HXT.Core (
52 import LWN.URI (URL, try_make_absolute_url)
53 import Misc (contains)
57 to_xhtml :: a -> String
60 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
62 -- | Options used when parsing HTML.
63 my_read_opts :: SysConfigList
64 my_read_opts = [ withValidate no,
68 -- | My version of HandsomeSoup's parseHTML.
69 parse_lwn :: String -> IOStateArrow s b XmlTree
70 parse_lwn = readString my_read_opts
73 -- | Takes the result of get_article_contents and calls parse_lwn on
74 -- the contained value.
75 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
81 -- | Preprocessing common to both page types.
82 preprocess :: (ArrowXml a) => a XmlTree XmlTree
84 make_image_srcs_absolute
88 replace_links_with_spans
90 replace_double_newline_in_pre
93 is_link :: (ArrowXml a) => a XmlTree XmlTree
95 isElem >>> hasName "a"
97 is_pre :: (ArrowXml a) => a XmlTree XmlTree
103 replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree
104 replace_double_newline_in_pre =
105 processTopDown $ fix_it `when` is_pre
107 fix_it :: (ArrowXml a) => a XmlTree XmlTree
109 changeText $ replace "\n\n" "<br /><br />"
111 remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
112 remove_full_story_paragraphs =
113 processTopDown $ none `when` full_story_paragraph
116 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
117 remove_comment_links =
118 processTopDown $ kill_comments `when` is_link
121 hasAttrValue "href" (contains "#Comments")
124 none `when` is_comment_link
126 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
127 replace_links_with_spans =
128 processTopDown $ (make_span >>> remove_attrs) `when` is_link
130 make_span = setElemName $ mkName "span"
131 remove_attrs = processAttrl none
134 is_title :: (ArrowXml a) => a XmlTree XmlTree
138 (hasAttrValue "class" (== "SummaryHL"))
141 is_byline :: (ArrowXml a) => a XmlTree XmlTree
145 (hasAttrValue "class" (== "FeatureByLine"))
148 is_image :: (ArrowXml a) => a XmlTree XmlTree
149 is_image = isElem >>> hasName "img"
151 remove_title :: (ArrowXml a) => a XmlTree XmlTree
153 processTopDown ((none) `when` is_title)
156 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
158 processTopDown ((none) `when` is_byline)
161 image_srcs :: (ArrowXml a) => a XmlTree URL
168 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
169 full_story_paragraph =
175 (this /> full_story_link)
180 -- Without regard to the parent paragraph.
181 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
188 (this /> hasText (== "Full Story"))
193 -- | Get the hrefs of all full story links.
194 full_story_urls :: (ArrowXml a) => a XmlTree URL
205 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
206 make_image_srcs_absolute =
207 processTopDown (make_srcs_absolute `when` is_image)
209 change_src :: (ArrowXml a) => a XmlTree XmlTree
211 changeAttrValue try_make_absolute_url
213 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
215 processAttrl $ change_src `when` hasName "src"