module LWN.XHTML (
XHTML,
XML,
full_story_urls,
image_srcs,
full_story_link,
full_story_paragraph,
is_image,
parse_lwn,
preprocess,
remove_byline,
remove_full_story_paragraphs,
remove_title,
to_xhtml,
to_xml,
xml_from_contents)
where
import Data.String.Utils (replace)
import Text.HandsomeSoup (css)
import Text.XML.HXT.Core (
(>>>),
(/>),
ArrowXml,
IOStateArrow,
SysConfigList,
XmlTree,
changeAttrValue,
changeText,
deep,
getAttrValue,
hasAttrValue,
hasName,
hasText,
ifA,
isElem,
mkName,
no,
none,
processAttrl,
processTopDown,
readString,
setElemName,
this,
when,
withParseHTML,
withValidate,
withWarnings,
yes)
import LWN.URI (URL, try_make_absolute_url)
import Misc (contains)
class XHTML a where
to_xhtml :: a -> String
class XML a where
to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
-- | Options used when parsing HTML.
my_read_opts :: SysConfigList
my_read_opts = [ withValidate no,
withParseHTML yes,
withWarnings no ]
-- | My version of HandsomeSoup's parseHTML.
parse_lwn :: String -> IOStateArrow s b XmlTree
parse_lwn = readString my_read_opts
-- | Takes the result of get_article_contents and calls parse_lwn on
-- the contained value.
xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
xml_from_contents =
fmap parse_lwn
-- | Preprocessing common to both page types.
preprocess :: (ArrowXml a) => a XmlTree XmlTree
preprocess =
make_image_srcs_absolute
>>>
remove_comment_links
>>>
replace_links_with_spans
>>>
replace_double_newline_in_pre
is_link :: (ArrowXml a) => a XmlTree XmlTree
is_link =
isElem >>> hasName "a"
is_pre :: (ArrowXml a) => a XmlTree XmlTree
is_pre =
isElem
>>>
hasName "pre"
replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree
replace_double_newline_in_pre =
processTopDown $ fix_it `when` is_pre
where
fix_it :: (ArrowXml a) => a XmlTree XmlTree
fix_it =
changeText $ replace "\n\n" "
"
remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
remove_full_story_paragraphs =
processTopDown $ none `when` full_story_paragraph
remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
remove_comment_links =
processTopDown $ kill_comments `when` is_link
where
is_comment_link =
hasAttrValue "href" (contains "#Comments")
kill_comments =
none `when` is_comment_link
replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
replace_links_with_spans =
processTopDown $ (make_span >>> remove_attrs) `when` is_link
where
make_span = setElemName $ mkName "span"
remove_attrs = processAttrl none
is_title :: (ArrowXml a) => a XmlTree XmlTree
is_title =
(hasName "h2")
>>>
(hasAttrValue "class" (== "SummaryHL"))
is_byline :: (ArrowXml a) => a XmlTree XmlTree
is_byline =
(hasName "div")
>>>
(hasAttrValue "class" (== "FeatureByLine"))
is_image :: (ArrowXml a) => a XmlTree XmlTree
is_image = isElem >>> hasName "img"
remove_title :: (ArrowXml a) => a XmlTree XmlTree
remove_title =
processTopDown ((none) `when` is_title)
remove_byline :: (ArrowXml a) => a XmlTree XmlTree
remove_byline =
processTopDown ((none) `when` is_byline)
image_srcs :: (ArrowXml a) => a XmlTree URL
image_srcs =
css "img"
>>>
getAttrValue "src"
full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
full_story_paragraph =
isElem
>>>
hasName "p"
>>>
ifA
(this /> full_story_link)
this
none
-- Without regard to the parent paragraph.
full_story_link :: (ArrowXml a) => a XmlTree XmlTree
full_story_link =
isElem
>>>
hasName "a"
>>>
ifA
(this /> hasText (== "Full Story"))
this
none
-- | Get the hrefs of all full story links.
full_story_urls :: (ArrowXml a) => a XmlTree URL
full_story_urls =
deep $
full_story_paragraph
/>
full_story_link
>>>
getAttrValue "href"
make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_image_srcs_absolute =
processTopDown (make_srcs_absolute `when` is_image)
where
change_src :: (ArrowXml a) => a XmlTree XmlTree
change_src =
changeAttrValue try_make_absolute_url
make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_srcs_absolute =
processAttrl $ change_src `when` hasName "src"