+-- |Application-specific XML functions.
module Twitter.Xml
where
-import Data.Maybe
+import Test.HUnit
import Text.Regex (mkRegex, subRegex)
import Text.XML.HaXml
+-- |Extracts the 'CharData' carried by a 'Content' node. A 'CString'
+-- yields its text and a 'CRef' (XML entity reference) yields its
+-- 'verbatim' rendering; any other kind of content gives 'Nothing'.
get_char_data :: Content -> (Maybe CharData)
get_char_data content =
  case content of
    CString _ chars -> Just chars
    CRef reference  -> Just (verbatim reference)
    _               -> Nothing
+-- |A 'CFilter' that keeps content which is itself a <status> element.
+-- It is called "single" because a reply holding more than one status
+-- wraps them in a <statuses> tag, so they are no longer top-level;
+-- 'all_statuses' covers that case.
+single_status :: CFilter
+single_status = tag "status"
+
+-- |A 'CFilter' selecting every <status> child of a <statuses> element.
all_statuses :: CFilter
all_statuses = statuses /> status
  where
    statuses = tag "statuses"
    status   = tag "status"
--- Called unique_id here because status_id is used elsewhere.
+-- |Extracts the text inside the <id> child of the given content. It
+-- is named unique_id because status_id is already used elsewhere.
unique_id :: CFilter
unique_id = id_element /> txt
  where
    id_element = keep /> tag "id"
+-- |Extracts the text inside the <created_at> child of the given
+-- content.
status_created_at :: CFilter
status_created_at = created_at_element /> txt
  where
    created_at_element = keep /> tag "created_at"
+-- |Extracts the text inside the <text> child of the given
+-- content.
status_text :: CFilter
status_text = text_element /> txt
  where
    text_element = keep /> tag "text"
+-- |Selects the <user> child element (as XML, not text) of the given
+-- content.
status_user :: CFilter
status_user = keep /> tag "user"
-
+
+-- |Extracts the text inside the <screen_name> child of the given
+-- content.
user_screen_name :: CFilter
user_screen_name = screen_name_element /> txt
  where
    screen_name_element = keep /> tag "screen_name"
+-- |Pairs of (regular expression matching an XML entity name, ASCII
+-- replacement text). NOTE(review): "amp" precedes "lt" and "gt"; if applied
+-- in list order, "&amp;lt;" would decode twice (to "<") -- TODO confirm.
xml_entities :: [(String, String)]
xml_entities = [("[lr]dquo", "\""),
+ ("quot", "\""),
("[mn]dash", "-"),
("nbsp", " "),
("#8217", "'"),
("amp", "&"),
("lt", "<"),
- ("gt", ">")]
+ ("gt", ">"),
+ ("#8230", "..."),
+ ("hellip", "...")]
+-- |Replaces every XML entity listed in 'xml_entities' within the target.
replace_entities :: String -> String
replace_entities = unescape_recursive xml_entities
+-- |The recursive function which does the real work for
+-- 'replace_entities'.
unescape_recursive :: [(String, String)] -> String -> String
unescape_recursive [] target = target
unescape_recursive replacements target =
replacement = (replacements !! 0)
from = "&" ++ (fst replacement) ++ ";"
to = (snd replacement)
+
+
+
+-- |The HUnit tests defined in this module.
+xml_tests :: [Test]
+xml_tests = [test_replace_entities]
+
+
+-- |Checks that 'replace_entities' decodes named and numeric entity
+-- references. The input is written with the references still encoded
+-- (for example @&quot;@ and @&#8230;@) so that there is something to
+-- replace; every reference used here has an entry in 'xml_entities'.
+test_replace_entities :: Test
+test_replace_entities =
+  TestCase $ assertEqual description expected_text actual_text
+  where
+    description = "All entities are replaced correctly."
+    actual_text = replace_entities "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;"
+    expected_text = "\"The moon is gay......\" said <insert the current president of the United States of America>. \"It's OK--he's not a real doctor.\""