X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FLWN%2FXHTML.hs;h=f1a91c249b41608e764f15ca7e8922f06afa2a7c;hb=6a7cfdf0880ee5c5367e794babb30fa7eac22f39;hp=8dfe3b2e85645c451b601a58af96107f571e589b;hpb=aad40cd8e1e8c84c5fc294674a7159bb40838440;p=dead%2Flwn-epub.git
diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs
index 8dfe3b2..f1a91c2 100644
--- a/src/LWN/XHTML.hs
+++ b/src/LWN/XHTML.hs
@@ -1,28 +1,38 @@
module LWN.XHTML (
XHTML,
XML,
+ full_story_urls,
image_srcs,
+ full_story_link,
+ full_story_paragraph,
is_image,
parse_lwn,
preprocess,
remove_byline,
+ remove_full_story_paragraphs,
remove_title,
to_xhtml,
to_xml,
xml_from_contents)
where
+import Data.String.Utils (replace)
import Text.HandsomeSoup (css)
import Text.XML.HXT.Core (
(>>>),
+ (/>),
ArrowXml,
IOStateArrow,
SysConfigList,
XmlTree,
changeAttrValue,
+ changeText,
+ deep,
getAttrValue,
hasAttrValue,
hasName,
+ hasText,
+ ifA,
isElem,
mkName,
no,
@@ -31,12 +41,14 @@ import Text.XML.HXT.Core (
processTopDown,
readString,
setElemName,
+ this,
when,
withParseHTML,
withValidate,
withWarnings,
yes)
+
import LWN.URI (URL, try_make_absolute_url)
import Misc (contains)
@@ -74,12 +86,32 @@ preprocess =
remove_comment_links
>>>
replace_links_with_spans
+ >>>
+ replace_double_newline_in_pre
is_link :: (ArrowXml a) => a XmlTree XmlTree
is_link =
isElem >>> hasName "a"
+is_pre :: (ArrowXml a) => a XmlTree XmlTree
+is_pre =
+ isElem
+ >>>
+ hasName "pre"
+
+replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree
+replace_double_newline_in_pre =
+ processTopDown $ fix_it `when` is_pre
+ where
+ fix_it :: (ArrowXml a) => a XmlTree XmlTree
+ fix_it =
+ changeText $ replace "\n\n" "
"
+
+remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
+remove_full_story_paragraphs =
+ processTopDown $ none `when` full_story_paragraph
+
remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
remove_comment_links =
@@ -132,6 +164,44 @@ image_srcs =
>>>
getAttrValue "src"
+
+full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
+full_story_paragraph =
+ isElem
+ >>>
+ hasName "p"
+ >>>
+ ifA
+ (this /> full_story_link)
+ this
+ none
+
+
+-- Without regard to the parent paragraph.
+full_story_link :: (ArrowXml a) => a XmlTree XmlTree
+full_story_link =
+ isElem
+ >>>
+ hasName "a"
+ >>>
+ ifA
+ (this /> hasText (== "Full Story"))
+ this
+ none
+
+
+-- | Get the hrefs of all full story links.
+full_story_urls :: (ArrowXml a) => a XmlTree URL
+full_story_urls =
+ deep $
+ full_story_paragraph
+ />
+ full_story_link
+ >>>
+ getAttrValue "href"
+
+
+
make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_image_srcs_absolute =
processTopDown (make_srcs_absolute `when` is_image)