18 import Text.HandsomeSoup (css)
19 import Text.XML.HXT.Core (
49 import LWN.URI (URL, try_make_absolute_url)
50 import Misc (contains)
54 to_xhtml :: a -> String
57 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
59 -- | Options used when parsing HTML.
60 my_read_opts :: SysConfigList
61 my_read_opts = [ withValidate no,
65 -- | My version of HandsomeSoup's parseHTML.
66 parse_lwn :: String -> IOStateArrow s b XmlTree
67 parse_lwn = readString my_read_opts
70 -- | Takes the result of get_article_contents and calls parse_lwn on
71 -- the contained value.
72 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
78 -- | Preprocessing common to both page types.
79 preprocess :: (ArrowXml a) => a XmlTree XmlTree
81 make_image_srcs_absolute
85 replace_links_with_spans
88 is_link :: (ArrowXml a) => a XmlTree XmlTree
90 isElem >>> hasName "a"
93 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
94 remove_comment_links =
95 processTopDown $ kill_comments `when` is_link
98 hasAttrValue "href" (contains "#Comments")
101 none `when` is_comment_link
103 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
104 replace_links_with_spans =
105 processTopDown $ (make_span >>> remove_attrs) `when` is_link
107 make_span = setElemName $ mkName "span"
108 remove_attrs = processAttrl none
111 is_title :: (ArrowXml a) => a XmlTree XmlTree
115 (hasAttrValue "class" (== "SummaryHL"))
118 is_byline :: (ArrowXml a) => a XmlTree XmlTree
122 (hasAttrValue "class" (== "FeatureByLine"))
125 is_image :: (ArrowXml a) => a XmlTree XmlTree
126 is_image = isElem >>> hasName "img"
128 remove_title :: (ArrowXml a) => a XmlTree XmlTree
130 processTopDown ((none) `when` is_title)
133 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
135 processTopDown ((none) `when` is_byline)
138 image_srcs :: (ArrowXml a) => a XmlTree URL
145 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
146 full_story_paragraph =
152 (this /> full_story_link)
157 -- Without regard to the parent paragraph.
158 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
165 (this /> hasText (== "Full Story"))
170 -- | Get the hrefs of all full story links.
171 full_story_urls :: (ArrowXml a) => a XmlTree URL
182 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
183 make_image_srcs_absolute =
184 processTopDown (make_srcs_absolute `when` is_image)
186 change_src :: (ArrowXml a) => a XmlTree XmlTree
188 changeAttrValue try_make_absolute_url
190 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
192 processAttrl $ change_src `when` hasName "src"