{-
Provenance (gitweb blobdiff header of the deleted file):

X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fhalcyon.git;a=blobdiff_plain;f=src%2FTwitter%2FXml.hs;fp=src%2FTwitter%2FXml.hs;h=0000000000000000000000000000000000000000;hp=871d2164dee05737e9861d45c2e400dfde76e741;hb=9b6d95a82745ced2a58d9bc4ded555ee36b36673;hpb=81f6cb2ec955695d8d1a4619dab69e8fa4b3fb27

diff --git a/src/Twitter/Xml.hs b/src/Twitter/Xml.hs
deleted file mode 100644
index 871d216..0000000
--- a/src/Twitter/Xml.hs
+++ /dev/null
@@ -1,154 +0,0 @@
-}

-- |Application-specific XML functions.
module Twitter.Xml
where

import Data.Char (chr)
import Test.HUnit
import Text.Regex (matchRegex, mkRegex, subRegex)
import Text.XML.HaXml

-- |Returns the 'CharData' contained within the given 'Content', or
--  'Nothing' if no acceptable CharData was found. It will parse either
--  a 'CString' ('String') or 'CRef' (XML entity reference).
get_char_data :: Content i -> Maybe CharData
get_char_data (CString _ cd _) = Just cd
get_char_data (CRef ref _)     = Just (verbatim ref) -- Entities.
get_char_data _                = Nothing


-- |A 'CFilter' returning all top-level @status@ elements.
--  The name is due to the fact that if we retrieve more than
--  one status, they will be wrapped in a @statuses@ tag, and
--  thus not be top-level.
single_status :: CFilter i
single_status = tag "status"

-- |A 'CFilter' returning all @status@ tags within @statuses@.
all_statuses :: CFilter i
all_statuses = tag "statuses" /> tag "status"

-- |Finds the text of the @id@ element contained within some other
--  content. Called unique_id here because status_id is used elsewhere.
unique_id :: CFilter i
unique_id = keep /> tag "id" /> txt

-- |Finds the text of the @created_at@ element contained within some
--  other element.
status_created_at :: CFilter i
status_created_at = keep /> tag "created_at" /> txt

-- |Finds the text of the @text@ element contained within some
--  other element.
status_text :: CFilter i
status_text = keep /> tag "text" /> txt

-- |Finds the XML of the @user@ element contained within some other
--  element.
status_user :: CFilter i
status_user = keep /> tag "user"

-- |Finds the text of the @retweeted@ element contained within some
--  other element.
status_retweeted :: CFilter i
status_retweeted = keep /> tag "retweeted" /> txt

-- |Finds the text of the @in_reply_to_status_id@ element contained
--  within some other element.
status_reply_to_status_id :: CFilter i
status_reply_to_status_id = keep /> tag "in_reply_to_status_id" /> txt

-- |Finds the text of the @screen_name@ element contained within some
--  other element.
user_screen_name :: CFilter i
user_screen_name = keep /> tag "screen_name" /> txt

-- |A wrapper around 'reads' which returns either 'Nothing' or
--  'Just' the first value parsed from the front of the string. Any
--  input left over after that first parse is ignored.
maybe_read :: (Read a) => String -> Maybe a
maybe_read str =
  case reads str of
    (y, _) : _ -> Just y
    []         -> Nothing

-- |Takes a unicode codepoint in decimal and returns it as a
--  one-character string. Returns the empty string if the argument
--  cannot be parsed as a number, or if the number lies outside the
--  range of 'Char' (where 'chr' would otherwise raise an error).
entity_from_codepoint :: String -> String
entity_from_codepoint codepoint =
  -- Parse as Integer so that an over-long digit string cannot wrap
  -- around into an apparently valid codepoint.
  case (maybe_read codepoint :: Maybe Integer) of
    Just num
      | num >= 0 && num <= max_codepoint -> [chr (fromInteger num)]
    _ -> ""
  where
    max_codepoint = toInteger (fromEnum (maxBound :: Char))


-- |A list of tuples whose first entry is a regular expression
--  matching the name of an XML entity (without the surrounding
--  ampersand and semicolon), and whose second entry is the text that
--  the entity is replaced with.
--
--  For some reason, ampersands are escaped twice in the status
--  text. Rather than unescape everything twice, we just stick "amp"
--  in the list again.
xml_entities :: [(String, String)]
xml_entities = [("[lr]dquo", "\""),
                ("quot",     "\""),
                ("[mn]dash", "-"),
                ("nbsp",     " "),
                ("amp",      "&"),
                ("amp",      "&"),
                ("lt",       "<"),
                ("gt",       ">"),
                ("hellip",   "…")]

-- |Replace all of the XML entities in target: first the named
--  entities listed in 'xml_entities', then the numeric ones.
replace_entities :: String -> String
replace_entities target =
  unescape_numeric (unescape_recursive xml_entities target)

-- |Recursively unescape all numeric entities in the given String.
unescape_numeric :: String -> String
unescape_numeric target =
  case matchRegex numeric_entity target of
    -- The single capture group holds the decimal digits of the first
    -- numeric entity found. Replace every occurrence of that exact
    -- entity, then look for the next one.
    Just (codepoint : _) ->
      let this_entity_regex = mkRegex ("&#" ++ codepoint ++ ";")
          replacement       = entity_from_codepoint codepoint
      in unescape_numeric (subRegex this_entity_regex target replacement)
    -- No numeric entities remain.
    _ -> target
  where
    numeric_entity = mkRegex "&#([0-9]+);"



-- |The recursive function which does the real work for
--  'replace_entities'. Applies each (entity regex, replacement) pair
--  to the target in list order.
unescape_recursive :: [(String, String)] -> String -> String
unescape_recursive [] target = target
unescape_recursive ((entity, to) : rest) target =
  unescape_recursive rest (subRegex (mkRegex from) target to)
  where
    from = "&" ++ entity ++ ";"



-- |All of the tests defined in this module.
xml_tests :: [Test]
xml_tests = [ test_replace_entities, test_double_unescape ]


-- |Named and numeric entities are both replaced.
test_replace_entities :: Test
test_replace_entities =
  TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text
  where
    -- NOTE(review): the entity-encoded input was reconstructed from a
    -- copy in which the entities had already been decoded; confirm
    -- against the upstream file.
    actual_text = replace_entities "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;"
    expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It’s OK--he’s not a real doctor.\""


-- |A doubly-escaped ampersand is fully unescaped.
test_double_unescape :: Test
test_double_unescape =
  TestCase $ assertEqual "The status text is unescaped twice." expected_text actual_text
  where
    -- NOTE(review): "&amp;amp;" is inferred from the test description;
    -- confirm against the upstream file.
    actual_text = replace_entities "As a kid, I'd pull a girl's hair to let her know I liked her, but now that I'm older &amp;amp; wiser I simply hit her with my car."
    expected_text = "As a kid, I'd pull a girl's hair to let her know I liked her, but now that I'm older & wiser I simply hit her with my car."