X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FXHTML.hs;h=f1a91c249b41608e764f15ca7e8922f06afa2a7c;hp=8dfe3b2e85645c451b601a58af96107f571e589b;hb=b054d19e638a4983f88243e09f1aa474ae405998;hpb=aad40cd8e1e8c84c5fc294674a7159bb40838440 diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs index 8dfe3b2..f1a91c2 100644 --- a/src/LWN/XHTML.hs +++ b/src/LWN/XHTML.hs @@ -1,28 +1,38 @@ module LWN.XHTML ( XHTML, XML, + full_story_urls, image_srcs, + full_story_link, + full_story_paragraph, is_image, parse_lwn, preprocess, remove_byline, + remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, xml_from_contents) where +import Data.String.Utils (replace) import Text.HandsomeSoup (css) import Text.XML.HXT.Core ( (>>>), + (/>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, changeAttrValue, + changeText, + deep, getAttrValue, hasAttrValue, hasName, + hasText, + ifA, isElem, mkName, no, @@ -31,12 +41,14 @@ import Text.XML.HXT.Core ( processTopDown, readString, setElemName, + this, when, withParseHTML, withValidate, withWarnings, yes) + import LWN.URI (URL, try_make_absolute_url) import Misc (contains) @@ -74,12 +86,32 @@ preprocess = remove_comment_links >>> replace_links_with_spans + >>> + replace_double_newline_in_pre is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = isElem >>> hasName "a" +is_pre :: (ArrowXml a) => a XmlTree XmlTree +is_pre = + isElem + >>> + hasName "pre" + +replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree +replace_double_newline_in_pre = + processTopDown $ fix_it `when` is_pre + where + fix_it :: (ArrowXml a) => a XmlTree XmlTree + fix_it = + changeText $ replace "\n\n" "

" + +remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree +remove_full_story_paragraphs = + processTopDown $ none `when` full_story_paragraph + remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = @@ -132,6 +164,44 @@ image_srcs = >>> getAttrValue "src" + +full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree +full_story_paragraph = + isElem + >>> + hasName "p" + >>> + ifA + (this /> full_story_link) + this + none + + +-- Without regard to the parent paragraph. +full_story_link :: (ArrowXml a) => a XmlTree XmlTree +full_story_link = + isElem + >>> + hasName "a" + >>> + ifA + (this /> hasText (== "Full Story")) + this + none + + +-- | Get the hrefs of all full story links. +full_story_urls :: (ArrowXml a) => a XmlTree URL +full_story_urls = + deep $ + full_story_paragraph + /> + full_story_link + >>> + getAttrValue "href" + + + make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image)