8dfe3b2e85645c451b601a58af96107f571e589b
[dead/lwn-epub.git] / src / LWN / XHTML.hs
1 module LWN.XHTML (
2 XHTML,
3 XML,
4 image_srcs,
5 is_image,
6 parse_lwn,
7 preprocess,
8 remove_byline,
9 remove_title,
10 to_xhtml,
11 to_xml,
12 xml_from_contents)
13 where
14
15 import Text.HandsomeSoup (css)
16 import Text.XML.HXT.Core (
17 (>>>),
18 ArrowXml,
19 IOStateArrow,
20 SysConfigList,
21 XmlTree,
22 changeAttrValue,
23 getAttrValue,
24 hasAttrValue,
25 hasName,
26 isElem,
27 mkName,
28 no,
29 none,
30 processAttrl,
31 processTopDown,
32 readString,
33 setElemName,
34 when,
35 withParseHTML,
36 withValidate,
37 withWarnings,
38 yes)
39
40 import LWN.URI (URL, try_make_absolute_url)
41 import Misc (contains)
42
43
44 class XHTML a where
45 to_xhtml :: a -> String
46
47 class XML a where
48 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
49
50 -- | Options used when parsing HTML.
51 my_read_opts :: SysConfigList
52 my_read_opts = [ withValidate no,
53 withParseHTML yes,
54 withWarnings no ]
55
56 -- | My version of HandsomeSoup's parseHTML.
57 parse_lwn :: String -> IOStateArrow s b XmlTree
58 parse_lwn = readString my_read_opts
59
60
61 -- | Takes the result of get_article_contents and calls parse_lwn on
62 -- the contained value.
63 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
64 xml_from_contents =
65 fmap parse_lwn
66
67
68
69 -- | Preprocessing common to both page types.
70 preprocess :: (ArrowXml a) => a XmlTree XmlTree
71 preprocess =
72 make_image_srcs_absolute
73 >>>
74 remove_comment_links
75 >>>
76 replace_links_with_spans
77
78
79 is_link :: (ArrowXml a) => a XmlTree XmlTree
80 is_link =
81 isElem >>> hasName "a"
82
83
84 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
85 remove_comment_links =
86 processTopDown $ kill_comments `when` is_link
87 where
88 is_comment_link =
89 hasAttrValue "href" (contains "#Comments")
90
91 kill_comments =
92 none `when` is_comment_link
93
94 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
95 replace_links_with_spans =
96 processTopDown $ (make_span >>> remove_attrs) `when` is_link
97 where
98 make_span = setElemName $ mkName "span"
99 remove_attrs = processAttrl none
100
101
102 is_title :: (ArrowXml a) => a XmlTree XmlTree
103 is_title =
104 (hasName "h2")
105 >>>
106 (hasAttrValue "class" (== "SummaryHL"))
107
108
109 is_byline :: (ArrowXml a) => a XmlTree XmlTree
110 is_byline =
111 (hasName "div")
112 >>>
113 (hasAttrValue "class" (== "FeatureByLine"))
114
115
116 is_image :: (ArrowXml a) => a XmlTree XmlTree
117 is_image = isElem >>> hasName "img"
118
119 remove_title :: (ArrowXml a) => a XmlTree XmlTree
120 remove_title =
121 processTopDown ((none) `when` is_title)
122
123
124 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
125 remove_byline =
126 processTopDown ((none) `when` is_byline)
127
128
129 image_srcs :: (ArrowXml a) => a XmlTree URL
130 image_srcs =
131 css "img"
132 >>>
133 getAttrValue "src"
134
135 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
136 make_image_srcs_absolute =
137 processTopDown (make_srcs_absolute `when` is_image)
138 where
139 change_src :: (ArrowXml a) => a XmlTree XmlTree
140 change_src =
141 changeAttrValue try_make_absolute_url
142
143 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
144 make_srcs_absolute =
145 processAttrl $ change_src `when` hasName "src"