X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fhalcyon.git;a=blobdiff_plain;f=src%2FTwitter%2FXml.hs;h=0e23cb7e736905dcac1afc03bfd8c18f3cbd6341;hp=09013bf0cc9b623b7ca8ef3a12aeac31ea467cca;hb=636deff8c41fd640cc2945eae1d7b51432527a6a;hpb=17dd116706c4a971e1f5c68daa1656af5eff5cd2 diff --git a/src/Twitter/Xml.hs b/src/Twitter/Xml.hs index 09013bf..0e23cb7 100644 --- a/src/Twitter/Xml.hs +++ b/src/Twitter/Xml.hs @@ -1,48 +1,123 @@ +-- |Application-specific XML functions. module Twitter.Xml where -import Data.Maybe -import Text.Regex (mkRegex, subRegex) +import Data.Char (chr) +import Test.HUnit +import Text.Regex (matchRegex, mkRegex, subRegex) import Text.XML.HaXml -get_char_data :: Content -> (Maybe CharData) -get_char_data (CString _ cd) = Just cd -get_char_data (CRef ref) = Just (verbatim ref) -- Entities. +-- |Returns the 'CharData' contained within the given 'Content', or +-- 'Nothing' if no acceptable CharData was found. It will parse either +-- a 'CString' ('String') or 'CRef' (XML entity reference). +get_char_data :: Content i -> (Maybe CharData) +get_char_data (CString _ cd _) = Just cd +get_char_data (CRef ref _) = Just (verbatim ref) -- Entities. get_char_data _ = Nothing -all_statuses :: CFilter +-- |A 'CFilter' returning all top-level elements. +-- The name is due to the fact that if we retrieve more than +-- one status, they will be wrapped in a tag, and +-- thus not be top-level. +single_status :: CFilter i +single_status = (tag "status") + +-- |A 'CFilter' returning all tags within . +all_statuses :: CFilter i all_statuses = (tag "statuses" /> tag "status") --- Called unique_id here because status_id is used elsewhere. -unique_id :: CFilter +-- |Finds the text of the element contained within some other +-- content. Called unique_id here because status_id is used elsewhere. +unique_id :: CFilter i unique_id = keep /> (tag "id") /> txt -status_created_at :: CFilter +-- |Finds the text of the element contained within some +-- other element. +status_created_at :: CFilter i status_created_at = keep /> (tag "created_at") /> txt -status_text :: CFilter +-- |Finds the text of the element contained within some +-- other element. +status_text :: CFilter i status_text = keep /> (tag "text") /> txt -status_user :: CFilter +-- |Finds the XML of the element contained within some other +-- element. +status_user :: CFilter i status_user = keep /> (tag "user") - -user_screen_name :: CFilter + +-- | Finds the text of the element contained within some +-- other element. +status_retweeted :: CFilter i +status_retweeted = keep /> (tag "retweeted") /> txt + +-- | Finds the text of the element contained +-- within some other element. +status_reply_to_status_id :: CFilter i +status_reply_to_status_id = keep /> (tag "in_reply_to_status_id") /> txt + +-- |Finds the text of the element contained within some +-- other element. +user_screen_name :: CFilter i user_screen_name = keep /> (tag "screen_name") /> txt +-- |A wrapper around the 'read' function which returns either Nothing +-- or (Just ). +maybe_read :: (Read a) => String -> Maybe a +maybe_read str = + case (reads str) of + [] -> Nothing + ((y,_):_) -> Just y +-- |Takes a unicode codepoint in decimal and returns it as a +-- one-character string. +entity_from_codepoint :: String -> String +entity_from_codepoint codepoint = + case (maybe_read codepoint) of + Nothing -> "" + Just num -> [(chr num)] + + +-- |A list of tuples whose first entry is a regular expression +-- matching XML entities, and whose second entry is the ASCII +-- character represented by that entity. xml_entities :: [(String, String)] xml_entities = [("[lr]dquo", "\""), + ("quot", "\""), ("[mn]dash", "-"), ("nbsp", " "), - ("#8217", "'"), ("amp", "&"), ("lt", "<"), - ("gt", ">")] + ("gt", ">"), + ("hellip", "…")] +-- |Replace all of the XML entities in target. replace_entities :: String -> String -replace_entities target = unescape_recursive xml_entities target +replace_entities target = + unescape_numeric (unescape_recursive xml_entities target) + +-- |Recursively unescape all numeric entities in the given String. +unescape_numeric :: String -> String +unescape_numeric target = + case match of + Nothing -> target + Just subexprs -> + case subexprs of + [] -> target + s1:_ -> + let this_entity_regex = mkRegex ("&#" ++ s1 ++ ";") in + let replacement = entity_from_codepoint s1 in + let new_target = subRegex this_entity_regex target replacement in + unescape_numeric new_target + where + from = "&#([0-9]+);" + match = matchRegex (mkRegex from) target + + +-- |The recursive function which does the real work for +-- 'replace_entities'. unescape_recursive :: [(String, String)] -> String -> String unescape_recursive [] target = target unescape_recursive replacements target = @@ -51,3 +126,16 @@ unescape_recursive replacements target = replacement = (replacements !! 0) from = "&" ++ (fst replacement) ++ ";" to = (snd replacement) + + + +xml_tests :: [Test] +xml_tests = [ test_replace_entities ] + + +test_replace_entities :: Test +test_replace_entities = + TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text + where + actual_text = (replace_entities ""The moon is gay……" said <insert the current president of the United States of America>. “It’s OK—–he’s not a real doctor.”") + expected_text = "\"The moon is gay……\" said . \"It’s OK--he’s not a real doctor.\""