X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FXHTML.hs;h=8dfe3b2e85645c451b601a58af96107f571e589b;hp=f6632db19fe902afd69a8ea3017adeeba4b43655;hb=aad40cd8e1e8c84c5fc294674a7159bb40838440;hpb=2953924e2016393a1ffb9e2e82b4c90c8c57dfd3 diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs index f6632db..8dfe3b2 100644 --- a/src/LWN/XHTML.hs +++ b/src/LWN/XHTML.hs @@ -1,25 +1,45 @@ module LWN.XHTML ( XHTML, XML, - my_read_opts, + image_srcs, + is_image, parse_lwn, + preprocess, + remove_byline, + remove_title, to_xhtml, - to_xml - ) + to_xml, + xml_from_contents) where +import Text.HandsomeSoup (css) import Text.XML.HXT.Core ( + (>>>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, + changeAttrValue, + getAttrValue, + hasAttrValue, + hasName, + isElem, + mkName, no, + none, + processAttrl, + processTopDown, readString, + setElemName, + when, withParseHTML, withValidate, withWarnings, - yes - ) + yes) + +import LWN.URI (URL, try_make_absolute_url) +import Misc (contains) + class XHTML a where to_xhtml :: a -> String @@ -36,3 +56,90 @@ my_read_opts = [ withValidate no, -- | My version of HandsomeSoup's parseHTML. parse_lwn :: String -> IOStateArrow s b XmlTree parse_lwn = readString my_read_opts + + +-- | Takes the result of get_article_contents and calls parse_lwn on +-- the contained value. +xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree) +xml_from_contents = + fmap parse_lwn + + + +-- | Preprocessing common to both page types. +preprocess :: (ArrowXml a) => a XmlTree XmlTree +preprocess = + make_image_srcs_absolute + >>> + remove_comment_links + >>> + replace_links_with_spans + + +is_link :: (ArrowXml a) => a XmlTree XmlTree +is_link = + isElem >>> hasName "a" + + +remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree +remove_comment_links = + processTopDown $ kill_comments `when` is_link + where + is_comment_link = + hasAttrValue "href" (contains "#Comments") + + kill_comments = + none `when` is_comment_link + +replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree +replace_links_with_spans = + processTopDown $ (make_span >>> remove_attrs) `when` is_link + where + make_span = setElemName $ mkName "span" + remove_attrs = processAttrl none + + +is_title :: (ArrowXml a) => a XmlTree XmlTree +is_title = + (hasName "h2") + >>> + (hasAttrValue "class" (== "SummaryHL")) + + +is_byline :: (ArrowXml a) => a XmlTree XmlTree +is_byline = + (hasName "div") + >>> + (hasAttrValue "class" (== "FeatureByLine")) + + +is_image :: (ArrowXml a) => a XmlTree XmlTree +is_image = isElem >>> hasName "img" + +remove_title :: (ArrowXml a) => a XmlTree XmlTree +remove_title = + processTopDown ((none) `when` is_title) + + +remove_byline :: (ArrowXml a) => a XmlTree XmlTree +remove_byline = + processTopDown ((none) `when` is_byline) + + +image_srcs :: (ArrowXml a) => a XmlTree URL +image_srcs = + css "img" + >>> + getAttrValue "src" + +make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree +make_image_srcs_absolute = + processTopDown (make_srcs_absolute `when` is_image) + where + change_src :: (ArrowXml a) => a XmlTree XmlTree + change_src = + changeAttrValue try_make_absolute_url + + make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree + make_srcs_absolute = + processAttrl $ change_src `when` hasName "src"