1b1ea48e9015ce1732e6898d4dcbfdf7b98b3176
[dead/halcyon.git] / src / Twitter / Xml.hs
1 -- |Application-specific XML functions.
2 module Twitter.Xml
3 where
4
5 import Test.HUnit
6 import Text.Regex (mkRegex, subRegex)
7 import Text.XML.HaXml
8
-- |Extracts the 'CharData' held by a piece of 'Content'.
--
--  Two constructors are accepted: a 'CString', whose text is returned
--  as-is, and a 'CRef' (an XML entity reference), which is rendered
--  back to text with 'verbatim'. Any other kind of content yields
--  'Nothing'.
get_char_data :: Content -> (Maybe CharData)
get_char_data content =
  case content of
    CString _ text -> Just text
    CRef reference -> Just (verbatim reference) -- Entities.
    _              -> Nothing
16
17
-- |A 'CFilter' matching top-level @\<status\>@ elements.
--
--  It is called \"single\" because a response carrying more than one
--  status wraps them in a @\<statuses\>@ element, in which case the
--  individual statuses are no longer top-level (see 'all_statuses').
single_status :: CFilter
single_status = tag status_element
  where
    status_element = "status"
24
-- |A 'CFilter' matching every @\<status\>@ element that is a child
--  of a @\<statuses\>@ element.
all_statuses :: CFilter
all_statuses = statuses_wrapper /> status_child
  where
    statuses_wrapper = tag "statuses"
    status_child = tag "status"
28
-- |A 'CFilter' yielding the text inside the @\<id\>@ child of the
--  content it is applied to. It is named @unique_id@ rather than
--  @status_id@ because the latter name is already used elsewhere.
unique_id :: CFilter
unique_id = keep /> id_element /> txt
  where
    id_element = tag "id"
33
-- |A 'CFilter' yielding the text inside the @\<created_at\>@ child
--  of the element it is applied to.
status_created_at :: CFilter
status_created_at = keep /> created_at_element /> txt
  where
    created_at_element = tag "created_at"
38
-- |A 'CFilter' yielding the text inside the @\<text\>@ child of the
--  element it is applied to.
status_text :: CFilter
status_text = keep /> text_element /> txt
  where
    text_element = tag "text"
43
-- |A 'CFilter' yielding the @\<user\>@ child element (the XML
--  itself, not its text) of the element it is applied to.
status_user :: CFilter
status_user = keep /> user_element
  where
    user_element = tag "user"
48
-- |A 'CFilter' yielding the text inside the @\<screen_name\>@ child
--  of the element it is applied to.
user_screen_name :: CFilter
user_screen_name = keep /> screen_name_element /> txt
  where
    screen_name_element = tag "screen_name"
53
54
-- |A list of pairs describing how XML entities are turned into
--  plain text. The first component is a regular expression matching
--  the entity name (without the surrounding @&@ and @;@, which
--  'unescape_recursive' adds); the second is the ASCII text that the
--  entity is replaced with.
--
--  Order matters. 'unescape_recursive' applies the entries one after
--  another to the same string, so the output of an earlier entry is
--  visible to every later one. The @amp@ entry must therefore stay
--  last: replacing it earlier would let the @&@ it produces combine
--  with the following text into a new entity, decoding
--  double-escaped input such as @&amp;lt;@ twice (to @\<@) instead
--  of once (to @&lt;@).
xml_entities :: [(String, String)]
xml_entities = [("[lr]dquo", "\""),
                ("quot", "\""),
                ("[mn]dash", "-"),
                ("nbsp", " "),
                ("#8217", "'"),
                ("lt", "<"),
                ("gt", ">"),
                ("#8230", "..."),
                ("hellip", "..."),
                -- Keep this entry last; see the note above.
                ("amp", "&")]
69
-- |Replaces every XML entity listed in 'xml_entities' that occurs in
--  the given string with its plain-text equivalent. The entries are
--  applied in list order by 'unescape_recursive'.
replace_entities :: String -> String
replace_entities = unescape_recursive xml_entities
73
-- |The recursive function which does the real work for
--  'replace_entities'.
--
--  The first argument is a list of @(entity, replacement)@ pairs,
--  where @entity@ is a regular expression for an entity name. For
--  each pair in turn, every match of @&entity;@ in the target string
--  is substituted with @replacement@, and the result is fed to the
--  next pair. Once the list is exhausted the string is returned.
--
--  Because the replacement is handed to 'subRegex' unchanged, a
--  backslash in it is interpreted by 'subRegex' rather than copied
--  literally.
unescape_recursive :: [(String, String)] -> String -> String
unescape_recursive [] target = target
unescape_recursive ((entity, replacement) : remaining) target =
  unescape_recursive remaining (subRegex (mkRegex entity_regex) target replacement)
  where
    -- Wrap the bare entity name in the XML entity delimiters.
    entity_regex = "&" ++ entity ++ ";"
84
85
86
-- |All HUnit tests defined in this module.
xml_tests :: [Test]
xml_tests = [test_replace_entities]
89
90
-- |Checks that 'replace_entities' decodes a sample sentence that
--  uses the quote, ellipsis, angle-bracket, apostrophe and dash
--  entities.
test_replace_entities :: Test
test_replace_entities =
  TestCase (assertEqual "All entities are replaced correctly." expected actual)
  where
    escaped_input = "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;"
    actual = replace_entities escaped_input
    expected = "\"The moon is gay......\" said <insert the current president of the United States of America>. \"It's OK--he's not a real doctor.\""