]> gitweb.michael.orlitzky.com - dead/halcyon.git/blob - src/Twitter/Xml.hs
0e23cb7e736905dcac1afc03bfd8c18f3cbd6341
[dead/halcyon.git] / src / Twitter / Xml.hs
1 -- |Application-specific XML functions.
2 module Twitter.Xml
3 where
4
5 import Data.Char (chr)
6 import Test.HUnit
7 import Text.Regex (matchRegex, mkRegex, subRegex)
8 import Text.XML.HaXml
9
-- |Extracts the 'CharData' carried by a 'Content' node.
--
-- A 'CString' yields its text as-is, a 'CRef' (XML entity reference)
-- yields the result of 'verbatim' applied to the reference, and every
-- other kind of node yields 'Nothing'.
get_char_data :: Content i -> Maybe CharData
get_char_data content =
  case content of
    CString _ text _ -> Just text
    CRef reference _ -> Just (verbatim reference) -- Entities.
    _                -> Nothing
17
18
-- |A 'CFilter' that selects top-level @status@ elements.
--
-- The \"single\" in the name reflects that a response holding more
-- than one status wraps them in a @statuses@ element, in which case
-- they are no longer top-level (see 'all_statuses').
single_status :: CFilter i
single_status = tag "status"
25
-- |A 'CFilter' that selects each @status@ element found directly
-- inside a @statuses@ element.
all_statuses :: CFilter i
all_statuses = tag "statuses" /> tag "status"
29
-- |Selects the text inside an @id@ element that is a child of the
-- given content. It is named 'unique_id' because the name status_id
-- is already used elsewhere.
unique_id :: CFilter i
unique_id = keep /> tag "id" /> txt
34
-- |Selects the text inside a @created_at@ element that is a child of
-- the given content.
status_created_at :: CFilter i
status_created_at = keep /> tag "created_at" /> txt
39
-- |Selects the text inside a @text@ element that is a child of the
-- given content.
status_text :: CFilter i
status_text = keep /> tag "text" /> txt
44
-- |Selects the @user@ element (the XML itself, not its text) that is
-- a child of the given content.
status_user :: CFilter i
status_user = keep /> tag "user"
49
-- |Selects the text inside a @retweeted@ element that is a child of
-- the given content.
status_retweeted :: CFilter i
status_retweeted = keep /> tag "retweeted" /> txt
54
-- |Selects the text inside an @in_reply_to_status_id@ element that is
-- a child of the given content.
status_reply_to_status_id :: CFilter i
status_reply_to_status_id = keep /> tag "in_reply_to_status_id" /> txt
59
-- |Selects the text inside a @screen_name@ element that is a child of
-- the given content.
user_screen_name :: CFilter i
user_screen_name = keep /> tag "screen_name" /> txt
64
-- |A non-crashing counterpart to 'read': gives back 'Nothing' when
-- the string has no parse, and 'Just' the parsed value otherwise.
--
-- Only the first parse produced by 'reads' is used, and any input
-- left over after that parse is ignored (so a numeric prefix followed
-- by other characters still succeeds).
maybe_read :: (Read a) => String -> Maybe a
maybe_read str
  | (value, _) : _ <- reads str = Just value
  | otherwise                   = Nothing
72
-- |Takes a unicode codepoint in decimal and returns it as a
-- one-character string.
--
-- The empty string is returned when the input cannot be read as a
-- number, or when the number lies outside the range of 'Char'. The
-- range check matters because 'chr' raises an error for such values
-- instead of returning a result.
entity_from_codepoint :: String -> String
entity_from_codepoint codepoint =
  case maybe_read codepoint of
    Just num | is_valid_codepoint num -> [chr num]
    _                                 -> ""
  where
    -- 'chr' is only defined between 0 and the largest 'Char'.
    is_valid_codepoint :: Int -> Bool
    is_valid_codepoint n = n >= 0 && n <= fromEnum (maxBound :: Char)
80
81
-- |A list of tuples whose first entry is a regular expression
-- matching the name of an XML entity (without the surrounding @&@ and
-- @;@), and whose second entry is the text that the entity is
-- replaced with. Some replacements are deliberate approximations
-- (both kinds of dash become a hyphen, typographic quotes become a
-- plain double quote).
--
-- The order is significant: 'unescape_recursive' applies the entries
-- one after another, front to back.
xml_entities :: [(String, String)]
xml_entities =
  [ ("[lr]dquo", "\"")
  , ("quot", "\"")
  , ("[mn]dash", "-")
  , ("nbsp", " ")
  , ("lt", "<")
  , ("gt", ">")
  , ("hellip", "…")
    -- "amp" has to come last. If it were applied earlier, an escaped
    -- ampersand such as "&amp;lt;" would first turn into "&lt;" and
    -- then be decoded a second time by a later entry.
  , ("amp", "&")
  ]
94
-- |Replaces the XML entities in a string: first the named entities
-- listed in 'xml_entities', then the decimal numeric entities.
replace_entities :: String -> String
replace_entities = unescape_numeric . unescape_recursive xml_entities
99
-- |Unescapes the decimal numeric entities (such as @&#8230;@) in the
-- given String.
--
-- Each step looks for a numeric entity, substitutes the corresponding
-- character for that entity throughout the string, and then starts
-- over on the result. The string is returned once no numeric entity
-- is matched.
unescape_numeric :: String -> String
unescape_numeric target =
  case matchRegex numeric_entity target of
    Just (digits : _) -> unescape_numeric (substitute digits)
    _                 -> target
  where
    -- Matches any decimal entity and captures its digits.
    numeric_entity = mkRegex "&#([0-9]+);"

    -- Replaces the one entity identified by the captured digits.
    substitute digits =
      subRegex (mkRegex ("&#" ++ digits ++ ";"))
               target
               (entity_from_codepoint digits)
116
117
118
-- |The recursive function which does the real work for
-- 'replace_entities'.
--
-- Takes a list of (entity name regex, replacement) pairs and a target
-- string. The pairs are applied in list order: each name is wrapped
-- in @&@ and @;@ to form a regular expression, and every match of it
-- in the target is replaced before moving on to the next pair.
unescape_recursive :: [(String, String)] -> String -> String
unescape_recursive [] target = target
unescape_recursive ((name, to) : rest) target =
  -- Matching on the head of the list avoids the partial functions
  -- 'tail' and '!!'.
  unescape_recursive rest (subRegex (mkRegex from) target to)
  where
    from = "&" ++ name ++ ";"
129
130
131
-- |The HUnit tests defined in this module.
xml_tests :: [Test]
xml_tests = [test_replace_entities]
134
135
-- |Checks 'replace_entities' against a sample that mixes named
-- entities (quot, hellip, lt, gt, ldquo, rdquo, mdash, ndash) with
-- decimal numeric ones (8230, 8217).
test_replace_entities :: Test
test_replace_entities =
  TestCase (assertEqual description expected_text actual_text)
  where
    description = "All entities are replaced correctly."
    escaped_input = "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;"
    actual_text = replace_entities escaped_input
    expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It’s OK--he’s not a real doctor.\""