module LWN.XHTML ( XHTML, XML, full_story_urls, image_srcs, full_story_link, full_story_paragraph, is_image, parse_lwn, preprocess, remove_byline, remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, xml_from_contents) where import Data.String.Utils (replace) import Text.HandsomeSoup (css) import Text.XML.HXT.Core ( (>>>), (/>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, changeAttrValue, changeText, deep, getAttrValue, hasAttrValue, hasName, hasText, ifA, isElem, mkName, no, none, processAttrl, processTopDown, readString, setElemName, this, when, withParseHTML, withValidate, withWarnings, yes) import LWN.URI (URL, try_make_absolute_url) import Misc (contains) class XHTML a where to_xhtml :: a -> String class XML a where to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree) -- | Options used when parsing HTML. my_read_opts :: SysConfigList my_read_opts = [ withValidate no, withParseHTML yes, withWarnings no ] -- | My version of HandsomeSoup's parseHTML. parse_lwn :: String -> IOStateArrow s b XmlTree parse_lwn = readString my_read_opts -- | Takes the result of get_article_contents and calls parse_lwn on -- the contained value. xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree) xml_from_contents = fmap parse_lwn -- | Preprocessing common to both page types. preprocess :: (ArrowXml a) => a XmlTree XmlTree preprocess = make_image_srcs_absolute >>> remove_comment_links >>> replace_links_with_spans >>> replace_double_newline_in_pre is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = isElem >>> hasName "a" is_pre :: (ArrowXml a) => a XmlTree XmlTree is_pre = isElem >>> hasName "pre" replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree replace_double_newline_in_pre = processTopDown $ fix_it `when` is_pre where fix_it :: (ArrowXml a) => a XmlTree XmlTree fix_it = changeText $ replace "\n\n" "

" remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree remove_full_story_paragraphs = processTopDown $ none `when` full_story_paragraph remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = processTopDown $ kill_comments `when` is_link where is_comment_link = hasAttrValue "href" (contains "#Comments") kill_comments = none `when` is_comment_link replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree replace_links_with_spans = processTopDown $ (make_span >>> remove_attrs) `when` is_link where make_span = setElemName $ mkName "span" remove_attrs = processAttrl none is_title :: (ArrowXml a) => a XmlTree XmlTree is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) is_byline :: (ArrowXml a) => a XmlTree XmlTree is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) is_image :: (ArrowXml a) => a XmlTree XmlTree is_image = isElem >>> hasName "img" remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = processTopDown ((none) `when` is_title) remove_byline :: (ArrowXml a) => a XmlTree XmlTree remove_byline = processTopDown ((none) `when` is_byline) image_srcs :: (ArrowXml a) => a XmlTree URL image_srcs = css "img" >>> getAttrValue "src" full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree full_story_paragraph = isElem >>> hasName "p" >>> ifA (this /> full_story_link) this none -- Without regard to the parent paragraph. full_story_link :: (ArrowXml a) => a XmlTree XmlTree full_story_link = isElem >>> hasName "a" >>> ifA (this /> hasText (== "Full Story")) this none -- | Get the hrefs of all full story links. full_story_urls :: (ArrowXml a) => a XmlTree URL full_story_urls = deep $ full_story_paragraph /> full_story_link >>> getAttrValue "href" make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image) where change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue try_make_absolute_url make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_srcs_absolute = processAttrl $ change_src `when` hasName "src"