Thread the Cfg object through the recursive loop, simplifying things greatly.

[dead/halcyon.git] / src / Twitter / Xml.hs
diff --git a/src/Twitter/Xml.hs b/src/Twitter/Xml.hs

index b34c6fbae94c4ca8629eec48ad849becb98c23e8..0e23cb7e736905dcac1afc03bfd8c18f3cbd6341 100644 (file)
--- a/src/Twitter/Xml.hs
+++ b/src/Twitter/Xml.hs
@@ -2,49 +2,82 @@
  module Twitter.Xml
  where
  
-import Data.Maybe
+import Data.Char (chr)
  import Test.HUnit
-import Text.Regex (mkRegex, subRegex)
+import Text.Regex (matchRegex, mkRegex, subRegex)
  import Text.XML.HaXml
  
  -- |Returns the 'CharData' contained within the given 'Content', or
  -- 'Nothing' if no acceptable CharData was found. It will parse either
  -- a 'CString' ('String') or 'CRef' (XML entity reference).
-get_char_data :: Content -> (Maybe CharData)
-get_char_data (CString _ cd) = Just cd
-get_char_data (CRef ref) = Just (verbatim ref) -- Entities.
+get_char_data :: Content i -> (Maybe CharData)
+get_char_data (CString _ cd _) = Just cd
+get_char_data (CRef ref _) = Just (verbatim ref) -- Entities.
  get_char_data _ = Nothing
  
  
+-- |A 'CFilter' returning all top-level <status> elements.
+-- The name reflects that a <status> is top-level only when a
+-- single status is retrieved; multiple statuses come wrapped
+-- in a <statuses> tag and are therefore not top-level.
+single_status :: CFilter i
+single_status = (tag "status")
+
  -- |A 'CFilter' returning all <status> tags within <statuses>.
-all_statuses :: CFilter
+all_statuses :: CFilter i
  all_statuses = (tag "statuses" /> tag "status")
  
  -- |Finds the text of the <id> element contained within some other
  -- content. Called unique_id here because status_id is used elsewhere.
-unique_id :: CFilter
+unique_id :: CFilter i
  unique_id = keep /> (tag "id") /> txt
  
  -- |Finds the text of the <created_at> element contained within some
  -- other element.
-status_created_at :: CFilter
+status_created_at :: CFilter i
  status_created_at = keep /> (tag "created_at") /> txt
  
  -- |Finds the text of the <text> element contained within some
  -- other element.
-status_text :: CFilter
+status_text :: CFilter i
  status_text = keep /> (tag "text") /> txt
  
  -- |Finds the XML of the <user> element contained within some other
  -- element.
-status_user :: CFilter
+status_user :: CFilter i
  status_user = keep /> (tag "user")
  
+-- | Finds the text of the <retweeted> element contained within some
+--   other element.
+status_retweeted :: CFilter i
+status_retweeted = keep /> (tag "retweeted") /> txt
+
+-- | Finds the text of the <in_reply_to_status_id> element contained
+--   within some other element.
+status_reply_to_status_id :: CFilter i
+status_reply_to_status_id = keep /> (tag "in_reply_to_status_id") /> txt
+
  -- |Finds the text of the <screen_name> element contained within some
  -- other element.
-user_screen_name :: CFilter
+user_screen_name :: CFilter i
  user_screen_name = keep /> (tag "screen_name") /> txt
  
+-- |A safe alternative to 'read', built on 'reads': returns Nothing if
+-- nothing parses, else Just the first parse (trailing input is ignored).
+maybe_read :: (Read a) => String -> Maybe a
+maybe_read str =
+    case (reads str) of
+      [] -> Nothing
+      ((y,_):_) -> Just y
+
+-- |Takes a unicode codepoint in decimal and returns it as a
+-- one-character string, or the empty string if it cannot be read.
+entity_from_codepoint :: String -> String
+entity_from_codepoint codepoint =
+    case (maybe_read codepoint) of
+      Nothing  -> ""
+      Just num -> [(chr num)]
+
  
  -- |A list of tuples whose first entry is a regular expression
  -- matching XML entities, and whose second entry is the ASCII
@@ -54,16 +87,34 @@ xml_entities = [("[lr]dquo", "\""),
                  ("quot",     "\""),
                  ("[mn]dash", "-"),
                  ("nbsp",     " "),
-                ("#8217",    "'"),
                  ("amp",      "&"),
                  ("lt",       "<"),
                  ("gt",       ">"),
-                ("#8230",    "..."),
-                ("hellip",   "...")]
+                ("hellip",   "…")]
  
  -- |Replace all of the XML entities in target.
  replace_entities :: String -> String
-replace_entities target = unescape_recursive xml_entities target
+replace_entities target =
+    unescape_numeric (unescape_recursive xml_entities target)
+
+-- |Recursively unescape all decimal numeric entities (&#NNN;) in the given String.
+unescape_numeric :: String -> String
+unescape_numeric target =
+    case match of
+      Nothing -> target
+      Just subexprs ->
+          case subexprs of
+            []   -> target
+            s1:_ ->
+                let this_entity_regex = mkRegex ("&#" ++ s1 ++ ";") in
+                let replacement = entity_from_codepoint s1 in
+                let new_target = subRegex this_entity_regex target replacement in
+                unescape_numeric new_target
+    where
+      from = "&#([0-9]+);"
+      match = matchRegex (mkRegex from) target
+
+
  
  -- |The recursive function which does the real work for
  -- 'replace_entities'.
@@ -87,4 +138,4 @@ test_replace_entities =
      TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text
      where
        actual_text = (replace_entities "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;")
-      expected_text = "\"The moon is gay......\" said <insert the current president of the United States of America>. \"It's OK--he's not a real doctor.\""
+      expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It’s OK--he’s not a real doctor.\""