X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fhalcyon.git;a=blobdiff_plain;f=src%2FTwitter%2FXml.hs;h=871d2164dee05737e9861d45c2e400dfde76e741;hp=e284a0da5ae6601c2a8fac8ee0a9730989325773;hb=81f6cb2ec955695d8d1a4619dab69e8fa4b3fb27;hpb=ee2deb5a51e416d607cce45ddd10d4e19c050771 diff --git a/src/Twitter/Xml.hs b/src/Twitter/Xml.hs index e284a0d..871d216 100644 --- a/src/Twitter/Xml.hs +++ b/src/Twitter/Xml.hs @@ -10,9 +10,9 @@ import Text.XML.HaXml -- |Returns the 'CharData' contained within the given 'Content', or -- 'Nothing' if no acceptable CharData was found. It will parse either -- a 'CString' ('String') or 'CRef' (XML entity reference). -get_char_data :: Content -> (Maybe CharData) -get_char_data (CString _ cd) = Just cd -get_char_data (CRef ref) = Just (verbatim ref) -- Entities. +get_char_data :: Content i -> (Maybe CharData) +get_char_data (CString _ cd _) = Just cd +get_char_data (CRef ref _) = Just (verbatim ref) -- Entities. get_char_data _ = Nothing @@ -20,36 +20,46 @@ get_char_data _ = Nothing -- The name is due to the fact that if we retrieve more than -- one status, they will be wrapped in a <statuses> tag, and -- thus not be top-level. -single_status :: CFilter +single_status :: CFilter i single_status = (tag "status") -- |A 'CFilter' returning all <status> tags within <statuses>. -all_statuses :: CFilter +all_statuses :: CFilter i all_statuses = (tag "statuses" /> tag "status") -- |Finds the text of the <id> element contained within some other -- content. Called unique_id here because status_id is used elsewhere. -unique_id :: CFilter +unique_id :: CFilter i unique_id = keep /> (tag "id") /> txt -- |Finds the text of the <created_at> element contained within some -- other element. -status_created_at :: CFilter +status_created_at :: CFilter i status_created_at = keep /> (tag "created_at") /> txt -- |Finds the text of the <text> element contained within some -- other element. 
-status_text :: CFilter +status_text :: CFilter i status_text = keep /> (tag "text") /> txt -- |Finds the XML of the <user> element contained within some other -- element. -status_user :: CFilter +status_user :: CFilter i status_user = keep /> (tag "user") +-- | Finds the text of the <retweeted> element contained within some +-- other element. +status_retweeted :: CFilter i +status_retweeted = keep /> (tag "retweeted") /> txt + +-- | Finds the text of the <in_reply_to_status_id> element contained +-- within some other element. +status_reply_to_status_id :: CFilter i +status_reply_to_status_id = keep /> (tag "in_reply_to_status_id") /> txt + -- |Finds the text of the <screen_name> element contained within some -- other element. -user_screen_name :: CFilter +user_screen_name :: CFilter i user_screen_name = keep /> (tag "screen_name") /> txt -- |A wrapper around the 'read' function which returns either Nothing @@ -69,15 +79,20 @@ entity_from_codepoint codepoint = Just num -> [(chr num)] --- |A list of tuples whose first entry is a regular expression --- matching XML entities, and whose second entry is the ASCII --- character represented by that entity. +-- | A list of tuples whose first entry is a regular expression +-- matching XML entities, and whose second entry is the ASCII +-- character represented by that entity. +-- +-- For some reason, ampersands are escaped twice in the status +-- text. Rather than unescape everything twice, we just stick "amp" +-- in the list again. 
xml_entities :: [(String, String)] xml_entities = [("[lr]dquo", "\""), ("quot", "\""), ("[mn]dash", "-"), ("nbsp", " "), ("amp", "&"), + ("amp", "&"), ("lt", "<"), ("gt", ">"), ("hellip", "…")] @@ -120,7 +135,7 @@ unescape_recursive replacements target = xml_tests :: [Test] -xml_tests = [ test_replace_entities ] +xml_tests = [ test_replace_entities, test_double_unescape ] test_replace_entities :: Test @@ -129,3 +144,11 @@ test_replace_entities = where actual_text = (replace_entities ""The moon is gay……" said <insert the current president of the United States of America>. “It’s OK—–he’s not a real doctor.”") expected_text = "\"The moon is gay……\" said . \"It’s OK--he’s not a real doctor.\"" + + +test_double_unescape :: Test +test_double_unescape = + TestCase $ assertEqual "The status text is unescaped twice." expected_text actual_text + where + actual_text = (replace_entities "As a kid, I'd pull a girl's hair to let her know I liked her, but now that I'm older &amp; wiser I simply hit her with my car.") + expected_text = "As a kid, I'd pull a girl's hair to let her know I liked her, but now that I'm older & wiser I simply hit her with my car."