-- |Application-specific XML functions.
module Twitter.Xml
where

import Data.Char (chr)
import Test.HUnit
import Text.Regex (matchRegex, mkRegex, subRegex)
import Text.XML.HaXml


-- |Returns the 'CharData' contained within the given 'Content', or
--  'Nothing' if no acceptable CharData was found. It accepts either
--  a 'CString' ('String') or a 'CRef' (XML entity reference); a
--  reference is rendered with 'verbatim'.
get_char_data :: Content i -> (Maybe CharData)
get_char_data (CString _ cd _) = Just cd
get_char_data (CRef ref _)     = Just (verbatim ref) -- Entities.
get_char_data _                = Nothing


-- |A 'CFilter' returning all top-level \<status\> elements.
--  The name is due to the fact that if we retrieve more than
--  one status, they will be wrapped in a \<statuses\> tag, and
--  thus not be top-level.
single_status :: CFilter i
single_status = (tag "status")


-- |A 'CFilter' returning all \<status\> tags within \<statuses\>.
all_statuses :: CFilter i
all_statuses = (tag "statuses" /> tag "status")


-- |Finds the text of the \<id\> element contained within some other
--  content. Called unique_id here because status_id is used elsewhere.
unique_id :: CFilter i
unique_id = keep /> (tag "id") /> txt


-- |Finds the text of the \<created_at\> element contained within some
--  other element.
status_created_at :: CFilter i
status_created_at = keep /> (tag "created_at") /> txt


-- |Finds the text of the \<text\> element contained within some
--  other element.
status_text :: CFilter i
status_text = keep /> (tag "text") /> txt


-- |Finds the XML of the \<user\> element contained within some other
--  element.
status_user :: CFilter i
status_user = keep /> (tag "user")


-- |Finds the text of the \<retweeted\> element contained within some
--  other element.
status_retweeted :: CFilter i
status_retweeted = keep /> (tag "retweeted") /> txt


-- |Finds the text of the \<in_reply_to_status_id\> element contained
--  within some other element.
status_reply_to_status_id :: CFilter i
status_reply_to_status_id = keep /> (tag "in_reply_to_status_id") /> txt


-- |Finds the text of the \<screen_name\> element contained within some
--  other element.
user_screen_name :: CFilter i
user_screen_name = keep /> (tag "screen_name") /> txt


-- |A wrapper around 'reads' which returns either 'Nothing' or
--  (Just \<parsed value\>). Only the first parse is used, and any
--  unconsumed trailing input is ignored.
maybe_read :: (Read a) => String -> Maybe a
maybe_read str =
    case (reads str) of
      []        -> Nothing
      ((y,_):_) -> Just y


-- |Takes a unicode codepoint in decimal and returns it as a
--  one-character string. Returns the empty string if the input does
--  not parse as a number, or if the number is not a valid 'Char'
--  codepoint.
entity_from_codepoint :: String -> String
entity_from_codepoint codepoint =
    -- Parse as Integer so that overly long digit strings cannot wrap
    -- around, and check the range before calling 'chr', which raises
    -- an error when given a value outside of the 'Char' range.
    case (maybe_read codepoint :: Maybe Integer) of
      Just num
          | num >= 0 && num <= max_codepoint -> [chr (fromInteger num)]
      _ -> ""
    where
      max_codepoint = toInteger (fromEnum (maxBound :: Char))


-- |A list of tuples whose first entry is a regular expression
--  matching XML entities, and whose second entry is the
--  character represented by that entity.
--
--  The entries are applied in order by 'unescape_recursive', so
--  \"amp\" must stay last: decoding it earlier would turn an escaped
--  entity such as @&amp;lt;@ into @&lt;@ and then into @<@.
xml_entities :: [(String, String)]
xml_entities = [("[lr]dquo", "\""),
                ("quot", "\""),
                ("[mn]dash", "-"),
                ("nbsp", " "),
                ("lt", "<"),
                ("gt", ">"),
                ("hellip", "…"),
                ("amp", "&")]


-- |Replace all of the XML entities in target. The named entities in
--  'xml_entities' are replaced first, followed by the numeric ones.
--
--  NOTE(review): because these are two separate passes, an escaped
--  numeric entity such as @&amp;#60;@ is still decoded twice.
replace_entities :: String -> String
replace_entities target =
    unescape_numeric (unescape_recursive xml_entities target)


-- |Recursively unescape all numeric (decimal) entities in the given
--  String. Each step finds the first numeric entity, replaces every
--  occurrence of it, and then recurses on the result. Entities whose
--  codepoint is invalid are removed (see 'entity_from_codepoint').
unescape_numeric :: String -> String
unescape_numeric target =
    case matchRegex (mkRegex from) target of
      Just (digits:_) ->
          let this_entity_regex = mkRegex ("&#" ++ digits ++ ";")
              replacement       = entity_from_codepoint digits
          in
            -- The replacement is at most one character, so the target
            -- shrinks on every step and the recursion terminates.
            unescape_numeric (subRegex this_entity_regex target replacement)
      _ -> target
    where
      from = "&#([0-9]+);"


-- |The recursive function which does the real work of replacing
--  named entities for 'replace_entities'. Each (pattern, replacement)
--  pair is applied to the target in list order.
unescape_recursive :: [(String, String)] -> String -> String
unescape_recursive [] target = target
unescape_recursive ((name, to):rest) target =
    unescape_recursive rest (subRegex (mkRegex from) target to)
    where
      from = "&" ++ name ++ ";"


xml_tests :: [Test]
xml_tests = [ test_replace_entities ]


-- |Checks that named and numeric entities are decoded, and that an
--  entity missing from 'xml_entities' (rsquo) is left untouched.
test_replace_entities :: Test
test_replace_entities =
    TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text
    where
      actual_text = (replace_entities "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&rsquo;s OK&mdash;&ndash;he&rsquo;s not a real doctor.&rdquo;")
      expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It&rsquo;s OK--he&rsquo;s not a real doctor.\""