From: Michael Orlitzky Date: Mon, 25 Oct 2010 08:13:12 +0000 (-0400) Subject: Handle all numeric entities by converting them to UTF-8 rather than hard-coding each... X-Git-Url: https://gitweb.michael.orlitzky.com/?a=commitdiff_plain;h=ee2deb5a51e416d607cce45ddd10d4e19c050771;p=dead%2Fhalcyon.git Handle all numeric entities by converting them to UTF-8 rather than hard-coding each conversion. --- diff --git a/src/Twitter/Xml.hs b/src/Twitter/Xml.hs index 1b1ea48..e284a0d 100644 --- a/src/Twitter/Xml.hs +++ b/src/Twitter/Xml.hs @@ -2,8 +2,9 @@ module Twitter.Xml where +import Data.Char (chr) import Test.HUnit -import Text.Regex (mkRegex, subRegex) +import Text.Regex (matchRegex, mkRegex, subRegex) import Text.XML.HaXml -- |Returns the 'CharData' contained within the given 'Content', or @@ -51,6 +52,22 @@ status_user = keep /> (tag "user") user_screen_name :: CFilter user_screen_name = keep /> (tag "screen_name") /> txt +-- |A wrapper around the 'read' function which returns either Nothing +-- or (Just value). +maybe_read :: (Read a) => String -> Maybe a +maybe_read str = + case (reads str) of + [] -> Nothing + ((y,_):_) -> Just y + +-- |Takes a unicode codepoint in decimal and returns it as a +-- one-character string. +entity_from_codepoint :: String -> String +entity_from_codepoint codepoint = + case (maybe_read codepoint) of + Nothing -> "" + Just num -> [(chr num)] + -- |A list of tuples whose first entry is a regular expression -- matching XML entities, and whose second entry is the ASCII @@ -60,16 +77,34 @@ xml_entities = [("[lr]dquo", "\""), ("quot", "\""), ("[mn]dash", "-"), ("nbsp", " "), - ("#8217", "'"), ("amp", "&"), ("lt", "<"), ("gt", ">"), - ("#8230", "..."), - ("hellip", "...")] + ("hellip", "…")] -- |Replace all of the XML entities in target. 
replace_entities :: String -> String -replace_entities target = unescape_recursive xml_entities target +replace_entities target = + unescape_numeric (unescape_recursive xml_entities target) + +-- |Recursively unescape all numeric entities in the given String. +unescape_numeric :: String -> String +unescape_numeric target = + case match of + Nothing -> target + Just subexprs -> + case subexprs of + [] -> target + s1:_ -> + let this_entity_regex = mkRegex ("&#" ++ s1 ++ ";") in + let replacement = entity_from_codepoint s1 in + let new_target = subRegex this_entity_regex target replacement in + unescape_numeric new_target + where + from = "&#([0-9]+);" + match = matchRegex (mkRegex from) target + + -- |The recursive function which does the real work for -- 'replace_entities'. @@ -93,4 +128,4 @@ test_replace_entities = TestCase $ assertEqual "All entities are replaced correctly." expected_text actual_text where actual_text = (replace_entities "&quot;The moon is gay&#8230;&hellip;&quot; said &lt;insert the current president of the United States of America&gt;. &ldquo;It&#8217;s OK&mdash;&ndash;he&#8217;s not a real doctor.&rdquo;") - expected_text = "\"The moon is gay......\" said <insert the current president of the United States of America>. \"It's OK--he's not a real doctor.\"" + expected_text = "\"The moon is gay……\" said <insert the current president of the United States of America>. \"It’s OK--he’s not a real doctor.\""