module Twitter.Xml
where
+import Data.Char (chr)
import Test.HUnit
-import Text.Regex (mkRegex, subRegex)
+import Text.Regex (matchRegex, mkRegex, subRegex)
import Text.XML.HaXml
-- |Returns the 'CharData' contained within the given 'Content', or
user_screen_name :: CFilter
-user_screen_name = keep /> (tag "screen_name") /> txt
+-- Built in two steps: first pick out the "screen_name" elements, then
+-- apply 'txt' beneath them.
+user_screen_name = screen_name_elements /> txt
+  where
+    screen_name_elements = keep /> tag "screen_name"
+-- |A total alternative to the 'read' function: returns
+-- (Just <the thing that could be read>) when the whole of the given
+-- String parses as exactly one value, and Nothing otherwise.
+--
+-- Like 'read', surrounding whitespace is tolerated but trailing
+-- garbage is not, so @maybe_read "12abc"@ is Nothing rather than
+-- @Just 12@.
+maybe_read :: (Read a) => String -> Maybe a
+maybe_read str =
+  -- Keep only the parses whose leftover input lexes to nothing; this
+  -- is the same test the Prelude's own 'read' applies.
+  case [ y | (y, rest) <- reads str, ("", "") <- lex rest ] of
+    [y] -> Just y
+    _   -> Nothing
+
+-- |Takes a unicode codepoint in decimal and returns it as a
+-- one-character string.
+--
+-- Returns the empty string when the argument cannot be read as a
+-- number, or when the number lies outside the range of 'Char'
+-- (negative, or greater than @fromEnum (maxBound :: Char)@). 'chr'
+-- raises an error for such values, so they are filtered out here
+-- rather than being allowed to crash the caller.
+entity_from_codepoint :: String -> String
+entity_from_codepoint codepoint =
+  -- Read as an Integer, not an Int, so that an over-long run of
+  -- digits cannot wrap around into a valid-looking codepoint.
+  case (maybe_read codepoint :: Maybe Integer) of
+    Just num | in_range num -> [chr (fromInteger num)]
+    _                       -> ""
+  where
+    highest = toInteger (fromEnum (maxBound :: Char))
+    in_range n = n >= 0 && n <= highest
+
-- |A list of tuples whose first entry is a regular expression
-- matching XML entities, and whose second entry is the ASCII
("quot", "\""),
("[mn]dash", "-"),
("nbsp", " "),
- ("#8217", "'"),
("amp", "&"),
("lt", "<"),
("gt", ">"),
- ("#8230", "..."),
- ("hellip", "...")]
+ ("hellip", "…")]
-- |Replace all of the XML entities in target.
replace_entities :: String -> String
-replace_entities target = unescape_recursive xml_entities target
+-- Two passes, composed right to left: 'unescape_recursive' is run
+-- with the 'xml_entities' table first, and its output is then handed
+-- to 'unescape_numeric'.
+replace_entities =
+  unescape_numeric . unescape_recursive xml_entities
+
+-- |Unescape all decimal numeric entities (those of the form
+-- @&#NNN;@) in the given String.
+--
+-- The input is scanned exactly once, from left to right, and each
+-- entity is replaced by whatever 'entity_from_codepoint' returns for
+-- its digits. Text produced by a replacement is never examined
+-- again, so @&#38;#60;@ becomes the literal text @&#60;@ instead of
+-- being decoded a second time into @<@.
+--
+-- Anything that merely resembles an entity (no digits, or no closing
+-- semicolon) is passed through unchanged.
+unescape_numeric :: String -> String
+unescape_numeric [] = []
+unescape_numeric ('&':'#':rest) =
+  case span is_ascii_digit rest of
+    -- One or more digits followed by the terminating semicolon.
+    (digits@(_:_), ';':remainder) ->
+      entity_from_codepoint digits ++ unescape_numeric remainder
+    -- Not an entity after all: keep the "&#" and carry on from the
+    -- character that follows it.
+    _ -> '&' : '#' : unescape_numeric rest
+  where
+    -- Same character class as the [0-9] this function used to match.
+    is_ascii_digit c = c >= '0' && c <= '9'
+unescape_numeric (c:cs) = c : unescape_numeric cs
+
+
-- |The recursive function which does the real work for
-- 'replace_entities'.
TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text
where
actual_text = (replace_entities ""The moon is gay……" said <insert the current president of the United States of America>. “It’s OK—–he’s not a real doctor.”")
- expected_text = "\"The moon is gay......\" said <insert the current president of the United States of America>. \"It's OK--he's not a real doctor.\""
+ expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It’s OK--he’s not a real doctor.\""