X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FTSN%2FPicklers.hs;h=3135d6ae194d483f3704755d46063cc513393808;hb=f0425854304197ab5ad47293b27b2e0b188cb844;hp=d87139b793bbb6dddc2e429a0c14b56fb96f3ccc;hpb=1f260c118e8da5679820c8cfa489d8fe4a521140;p=dead%2Fhtsn-import.git diff --git a/src/TSN/Picklers.hs b/src/TSN/Picklers.hs index d87139b..3135d6a 100644 --- a/src/TSN/Picklers.hs +++ b/src/TSN/Picklers.hs @@ -1,66 +1,633 @@ +{-# LANGUAGE ScopedTypeVariables #-} + -- | (Un)picklers for data types present in The Sports Network XML -- feed. -- module TSN.Picklers ( + pickler_tests, + xp_ambiguous_time, + xp_attr_option, xp_date, - xp_team_id, - xp_time ) + xp_date_padded, + xp_datetime, + xp_early_line_date, + xp_earnings, + xp_fracpart_only_double, + xp_gamedate, + xp_tba_time, + xp_time, + xp_time_dots, + xp_time_stamp ) where -- System imports. +import Data.Char ( toUpper ) +import Data.List ( intercalate ) +import Data.List.Split ( chunksOf ) +import Data.Maybe ( catMaybes, listToMaybe ) +import Data.String.Utils ( replace ) import Data.Time.Clock ( UTCTime ) import Data.Time.Format ( formatTime, parseTime ) -import System.Locale ( defaultTimeLocale ) +import Data.Tree.NTree.TypeDefs ( NTree(..) ) +import System.Locale ( TimeLocale( wDays, months ), defaultTimeLocale ) +import Test.Tasty ( TestTree, testGroup ) +import Test.Tasty.HUnit ( (@?=), testCase ) +import Text.Read ( readMaybe ) import Text.XML.HXT.Arrow.Pickle ( xpText, + xpWrap, xpWrapMaybe ) import Text.XML.HXT.Arrow.Pickle.Xml ( PU ) +import Text.XML.HXT.Core ( + XmlTree, + XNode( XTag, XText ), + mkName, + pickleDoc, + unpickleDoc ) + +-- Local imports. +import TSN.Parse ( + parse_time_stamp, + time_format, + time_stamp_format ) + + +-- | The format string for a base date in m/d/yyyy format. The +-- day/month are not padded at all. 
--   This will match for example,
--
--   * 2\/15\/1983
--
--   * 1\/1\/0000
--
date_format :: String
date_format = "%-m/%-d/%Y"


-- | The format string for a base date in mm/dd/yyyy format. Both the
--   day and the month are zero-padded to two characters, so this
--   will match for example,
--
--   * 02\/15\/1983
--
--   * 01\/01\/0000
--
date_format_padded :: String
date_format_padded = "%0m/%0d/%Y"


-- | (Un)pickle a 'UTCTime' that has no time portion, using the
--   unpadded 'date_format'.
--
--   /Examples/:
--
--   This should parse:
--
--   >>> let tn = text_node "2/15/1983"
--   >>> unpickleDoc xp_date tn
--   Just 1983-02-15 00:00:00 UTC
--
--   A leading zero in the month happens to parse as well, although
--   that isn't required behavior.
--
--   >>> let tn = text_node "02/15/1983"
--   >>> unpickleDoc xp_date tn
--   Just 1983-02-15 00:00:00 UTC
--
xp_date :: PU UTCTime
xp_date = xpWrapMaybe (parse_unpadded, show_unpadded) xpText
  where
    -- Text -> date; 'Nothing' when the text doesn't match the format.
    parse_unpadded :: String -> Maybe UTCTime
    parse_unpadded str = parseTime defaultTimeLocale date_format str

    -- Date -> text, in the same format that we parse.
    show_unpadded :: UTCTime -> String
    show_unpadded day = formatTime defaultTimeLocale date_format day


-- | (Un)pickle a 'UTCTime' that has no time portion, using the
--   zero-padded 'date_format_padded'.
--
--   Examples:
--
--   >>> let tn = text_node "02/15/1983"
--   >>> unpickleDoc xp_date_padded tn
--   Just 1983-02-15 00:00:00 UTC
--
--   >>> let tn = text_node "06/07/2014"
--   >>> unpickleDoc xp_date_padded tn
--   Just 2014-06-07 00:00:00 UTC
--
xp_date_padded :: PU UTCTime
xp_date_padded = xpWrapMaybe (parse_padded, show_padded) xpText
  where
    -- Text -> date; 'Nothing' when the text doesn't match the format.
    parse_padded :: String -> Maybe UTCTime
    parse_padded str = parseTime defaultTimeLocale date_format_padded str

    -- Date -> text, in the same format that we parse.
    show_padded :: UTCTime -> String
    show_padded day = formatTime defaultTimeLocale date_format_padded day



-- | Format a number as a string using a comma as the thousands
--   separator.
--
--   Examples:
--
--   >>> format_commas 0
--   "0"
--   >>> format_commas 10
--   "10"
--   >>> format_commas 100
--   "100"
--   >>> format_commas 1000
--   "1,000"
--   >>> format_commas 10000
--   "10,000"
--   >>> format_commas 100000
--   "100,000"
--   >>> format_commas 1000000
--   "1,000,000"
--
--   A minus sign is kept in front of the grouped digits and is never
--   treated as a digit itself:
--
--   >>> format_commas (-100)
--   "-100"
--   >>> format_commas (-1000)
--   "-1,000"
--
format_commas :: Int -> String
format_commas x =
  case show x of
    -- Peel off the sign first. If it were grouped along with the
    -- digits, a three-digit negative number would come out as "-,100".
    ('-':digits) -> '-' : group_digits digits
    digits       -> group_digits digits
  where
    -- | Insert a comma between every group of three digits, counting
    --   from the right. We reverse the string so that 'chunksOf' can
    --   count from the left, and then undo the reversal at the end.
    group_digits :: String -> String
    group_digits = reverse . intercalate "," . chunksOf 3 . reverse



-- | Parse the earnings from an 'AutoRaceResultsListing'. These are
--   essentially 'Int's, but they look like,
--
--   * 336,826
--
--   * 1,000,191
--
--   * TBA
--
--   The text \"TBA\" is unpickled as 'Nothing'. Anything else has its
--   commas removed and must then read as an 'Int'; if it doesn't,
--   the unpickling itself fails (rather than throwing an exception
--   from a partial 'read').
--
--   Examples:
--
--   >>> let tn = text_node "1,000,191"
--   >>> unpickleDoc xp_earnings tn
--   Just (Just 1000191)
--
--   >>> let tn = text_node "TBA"
--   >>> unpickleDoc xp_earnings tn
--   Just Nothing
--
--   >>> let tn = text_node "lots"
--   >>> unpickleDoc xp_earnings tn
--   Nothing
--
xp_earnings :: PU (Maybe Int)
xp_earnings =
  (to_earnings, from_earnings) `xpWrapMaybe` xpText
  where
    strip_commas :: String -> String
    strip_commas = replace "," ""

    -- | The outer 'Maybe' reports whether or not the text could be
    --   understood at all; the inner one is the (possibly \"TBA\")
    --   value that we hand back to the caller.
    to_earnings :: String -> Maybe (Maybe Int)
    to_earnings s
      | s == "TBA" = Just Nothing
      | otherwise  = fmap Just (readMaybe (strip_commas s))

    from_earnings :: Maybe Int -> String
    from_earnings Nothing  = "TBA"
    from_earnings (Just i) = format_commas i



-- | Pickle a 'Double' that can be missing its leading zero (for
--   values less than one). For example, we've seen,
--
--   0.5
--
--   Which 'xpPrim' can't handle without the leading
--   zero. Unfortunately there's no way pickle/unpickle can be
--   inverses of each other here, since \"0.5\" and \".5\" should
--   unpickle to the same 'Double'.
--
--   Examples:
--
--   >>> let tn = text_node "0.5"
--   >>> unpickleDoc xp_fracpart_only_double tn
--   Just 0.5
--
--   >>> let tn = text_node ".5"
--   >>> unpickleDoc xp_fracpart_only_double tn
--   Just 0.5
--
--   >>> let tn = text_node "foo"
--   >>> unpickleDoc xp_fracpart_only_double tn
--   Nothing
--
--   Negative values work too, with or without the leading zero:
--
--   >>> let tn = text_node "-.5"
--   >>> unpickleDoc xp_fracpart_only_double tn
--   Just (-0.5)
--
xp_fracpart_only_double :: PU Double
xp_fracpart_only_double =
  (to_double, from_double) `xpWrapMaybe` xpText
  where
    -- | Convert a 'String' to a 'Double', maybe. We always prepend a
    --   zero to the digits, since it will fix the fraction-only
    --   values, and not hurt the ones that already have a leading
    --   integer. A minus sign has to be set aside first: the zero
    --   belongs after the sign, and \"0-0.5\" doesn't read as a
    --   number. A lone \"-\" has no digits and is left to fail in the
    --   second equation.
    to_double :: String -> Maybe Double
    to_double ('-':rest@(_:_)) = fmap negate (readMaybe ('0' : rest))
    to_double s                = readMaybe ('0' : s)

    from_double :: Double -> String
    from_double = show



-- | (Un)pickle an unpadded 'UTCTime' that has both a date and a
--   12-hour time with seconds and an AM/PM marker. Used for example
--   in an 'AutoRaceResults' message.
--
--   Examples:
--
--   >>> let tn = text_node "6/1/2014 1:00:00 PM"
--   >>> unpickleDoc xp_datetime tn
--   Just 2014-06-01 13:00:00 UTC
--
--   >>> let tn = text_node "5/24/2014 2:45:00 PM"
--   >>> unpickleDoc xp_datetime tn
--   Just 2014-05-24 14:45:00 UTC
--
--   Padded! For some reason it works with only one zero in front. I
--   dunno man. NOT required (or even desired?) behavior.
--
--   >>> let tn = text_node "05/24/2014 2:45:00 PM"
--   >>> unpickleDoc xp_datetime tn
--   Just 2014-05-24 14:45:00 UTC
--
xp_datetime :: PU UTCTime
xp_datetime =
  (to_datetime, from_datetime) `xpWrapMaybe` xpText
  where
    -- The unpadded date, a space, and then an unpadded 12-hour time.
    format :: String
    format = date_format ++ " " ++ "%-I:%M:%S %p"

    to_datetime :: String -> Maybe UTCTime
    to_datetime = parseTime defaultTimeLocale format

    from_datetime :: UTCTime -> String
    from_datetime = formatTime defaultTimeLocale format



-- | Takes a 'UTCTime', and returns the English suffix that would be
--   appropriate after the day of the month. For example, if we have a
--   UTCTime representing Christmas, this would return \"th\" because
--   \"th\" is the right suffix of \"December 25th\".
--
--   Examples:
--
--   >>> import Data.Maybe ( fromJust )
--   >>> :{
--     let parse_date :: String -> Maybe UTCTime
--         parse_date = parseTime defaultTimeLocale date_format
--   :}
--
--   >>> let dates = [ "1/" ++ (d : "/1970") | d <- ['1'..'9'] ]
--   >>> let suffixes = map (date_suffix . fromJust . parse_date) dates
--   >>> suffixes
--   ["st","nd","rd","th","th","th","th","th","th"]
--
--   The eleventh, twelfth, and thirteenth end in 1, 2, and 3, but
--   they still take \"th\":
--
--   >>> let teens = [ "1/" ++ d ++ "/1970" | d <- ["11","12","13"] ]
--   >>> map (date_suffix . fromJust . parse_date) teens
--   ["th","th","th"]
--
date_suffix :: UTCTime -> String
date_suffix t =
  -- The day is reversed so that its last digit comes first and can
  -- be matched at the head of the list.
  case (reverse daystr) of
    []        -> []
    (_:'1':_) -> "th" -- Tens digit is one: the 10th through the 19th.
    ('1':_)   -> "st"
    ('2':_)   -> "nd"
    ('3':_)   -> "rd"
    _         -> "th"
  where
    -- The day of the month, zero-padded to two digits.
    daystr = formatTime defaultTimeLocale "%d" t


-- | (Un)pickle a UTCTime from a weather forecast's gamedate. Example
--   input looks like,
--
--   Monday, December 30th
--
--   When unpickling we get rid of the suffixes \"st\", \"nd\", \"rd\", and
--   \"th\". During pickling, we add them back with 'date_suffix',
--   based on the day of the month. The input has no year, and the
--   weekday that we pickle is computed from the parsed date; that's
--   why \"Monday\" comes back as \"Wednesday\" below.
--
--   Examples:
--
--   >>> let tn = text_node "Monday, December 30th"
--   >>> let (Just gd) = unpickleDoc xp_gamedate tn
--   >>> gd
--   1970-12-30 00:00:00 UTC
--   >>> pickleDoc xp_gamedate gd
--   NTree (XTag "/" []) [NTree (XText "Wednesday, December 30th") []]
--
xp_gamedate :: PU UTCTime
xp_gamedate =
  (to_gamedate, from_gamedate) `xpWrapMaybe` xpText
  where
    format = "%A, %B %-d"

    to_gamedate :: String -> Maybe UTCTime
    to_gamedate s =
      parseTime defaultTimeLocale format s'
      where
        -- The input with its two-character suffix removed, if it
        -- had one that we recognize.
        s' = case (reverse s) of
               (c2:c1:cs) -> let suffix = [c1,c2]
                             in
                               if suffix `elem` ["st","nd","rd","th"]
                               then reverse cs
                               else s -- Unknown suffix, leave it alone.

               _ -> s -- The String is less than two characters long,
                      -- leave it alone.


    from_gamedate :: UTCTime -> String
    from_gamedate d = s ++ (date_suffix d)
      where
        s = formatTime defaultTimeLocale format d



-- | (Un)pickle a UTCTime without the date portion. Doesn't work if
--   the fields aren't zero-padded to two characters.
--
--   /Examples/:
--
--   Padded, should work:
--
--   >>> let tn = text_node "04:35 PM"
--   >>> unpickleDoc xp_time tn
--   Just 1970-01-01 16:35:00 UTC
--
--   Unpadded, should fail:
--
--   >>> let tn = text_node "4:35 PM"
--   >>> unpickleDoc xp_time tn
--   Nothing
--
xp_time :: PU UTCTime
xp_time = xpWrapMaybe (parse_padded, show_padded) xpText
  where
    -- Text -> time; 'Nothing' when the text doesn't match.
    parse_padded :: String -> Maybe UTCTime
    parse_padded str = parseTime defaultTimeLocale time_format str

    -- Time -> text, in the same format that we parse.
    show_padded :: UTCTime -> String
    show_padded time = formatTime defaultTimeLocale time_format time


-- | (Un)pickle a UTCTime without the date portion. Unlike 'xp_time',
--   the AM/PM part is written with periods, i.e. \"A.M.\" and
--   \"P.M.\", and the \"hours\" part is not padded.
--
--   /Examples/:
--
--   A standard example of the correct form:
--
--   >>> let tn = text_node "11:30 A.M."
--   >>> let (Just result) = unpickleDoc xp_time_dots tn
--   >>> result
--   1970-01-01 11:30:00 UTC
--   >>> pickleDoc xp_time_dots result
--   NTree (XTag "/" []) [NTree (XText "11:30 A.M.") []]
--
--   Another miracle, it still parses with a leading zero!
--
--   >>> let tn = text_node "01:30 A.M."
--   >>> unpickleDoc xp_time_dots tn
--   Just 1970-01-01 01:30:00 UTC
--
xp_time_dots :: PU UTCTime
xp_time_dots = xpWrapMaybe (parse_dotted, show_dotted) xpText
  where
    -- | Like the padded format, except that the hours aren't padded
    --   with zeros.
    nopad_time_format :: String
    nopad_time_format = "%-I:%M %p"

    -- | Every period is deleted from the input before it is parsed,
    --   which turns \"A.M.\" into the \"AM\" that the format expects.
    parse_dotted :: String -> Maybe UTCTime
    parse_dotted str =
      parseTime defaultTimeLocale nopad_time_format undotted
      where
        undotted = replace "." "" str

    -- | Format the time without periods first, and then put them
    --   back into whichever of \"PM\" or \"AM\" was produced.
    show_dotted :: UTCTime -> String
    show_dotted =
      replace "AM" "A.M."
        . replace "PM" "P.M."
        . formatTime defaultTimeLocale nopad_time_format


-- | (Un)pickle a UTCTime without the date portion, allowing for a
--   value of \"TBA\" (which gets translated to 'Nothing').
--
--   /Examples/:
--
--   A failed parse will return 'Nothing':
--
--   >>> let tn = text_node "YO"
--   >>> unpickleDoc xp_tba_time tn
--   Just Nothing
--
--   And so will parsing a \"TBA\":
--
--   >>> let tn = text_node "TBA"
--   >>> unpickleDoc xp_tba_time tn
--   Just Nothing
--
--   But re-pickling 'Nothing' gives only \"TBA\":
--
--   >>> pickleDoc xp_tba_time Nothing
--   NTree (XTag "/" []) [NTree (XText "TBA") []]
--
--   A normal time is also parsed successfully, of course:
--
--   >>> let tn = text_node "08:10 PM"
--   >>> unpickleDoc xp_tba_time tn
--   Just (Just 1970-01-01 20:10:00 UTC)
--
xp_tba_time :: PU (Maybe UTCTime)
xp_tba_time = xpWrap (parse_tba, show_tba) xpText
  where
    -- | \"TBA\" is 'Nothing' outright; everything else is whatever
    --   the parser makes of it, which is 'Nothing' again for text
    --   that isn't a time.
    parse_tba :: String -> Maybe UTCTime
    parse_tba "TBA" = Nothing
    parse_tba str   = parseTime defaultTimeLocale time_format str

    -- | A missing time is written back out as \"TBA\".
    show_tba :: Maybe UTCTime -> String
    show_tba = maybe "TBA" (formatTime defaultTimeLocale time_format)



-- | (Un)pickle the \<time_stamp\> element format to/from a 'UTCTime'.
--   The time_stamp elements look something like,
--
--   \<time_stamp\> January 6, 2014, at 10:11 PM ET \<\/time_stamp\>
--
--   TSN doesn't provide a proper time zone name, only \"ET\" for
--   \"Eastern Time\". But \"Eastern Time\" changes throughout the
--   year, depending on one's location, for daylight-savings
--   time. It's really not any more useful to be off by one hour than
--   it is to be off by 5 hours, so rather than guess at EDT/EST, we
--   just store the timestamp as UTC.
--
--   Examples:
--
--   >>> let tn = text_node " January 6, 2014, at 10:11 PM ET "
--   >>> let (Just tstamp) = unpickleDoc xp_time_stamp tn
--   >>> tstamp
--   2014-01-06 22:11:00 UTC
--   >>> pickleDoc xp_time_stamp tstamp
--   NTree (XTag "/" []) [NTree (XText " January 6, 2014, at 10:11 PM ET ") []]
--
xp_time_stamp :: PU UTCTime
xp_time_stamp = xpWrapMaybe (parse_time_stamp, show_time_stamp) xpText
  where
    -- | The 'time_stamp_format' has to be re-padded with a leading
    --   and a trailing space before we format with it; see the
    --   documentation of 'time_stamp_format' for more information.
    padded_format :: String
    padded_format = " " ++ time_stamp_format ++ " "

    show_time_stamp :: UTCTime -> String
    show_time_stamp stamp = formatTime defaultTimeLocale padded_format stamp



-- | (Un)pickle a 12-hour time that is missing its AM/PM part, and is
--   therefore ambiguous.
--
--   Examples:
--
--   >>> let tn = text_node "8:00"
--   >>> unpickleDoc xp_ambiguous_time tn
--   Just 1970-01-01 08:00:00 UTC
--
xp_ambiguous_time :: PU UTCTime
xp_ambiguous_time = xpWrapMaybe (parse_ambiguous, show_ambiguous) xpText
  where
    -- Unpadded hours, then minutes, and nothing else.
    ambiguous_time_format :: String
    ambiguous_time_format = "%-I:%M"

    parse_ambiguous :: String -> Maybe UTCTime
    parse_ambiguous str =
      parseTime defaultTimeLocale ambiguous_time_format str

    show_ambiguous :: UTCTime -> String
    show_ambiguous time =
      formatTime defaultTimeLocale ambiguous_time_format time


-- | Pickle a date value as it appears in the early lines. This is a
--   particularly wacky format, but then so is the associated time
--   (see 'xp_ambiguous_time').
--
--   Examples:
--
--   >>> let tn = text_node "SUNDAY, MAY 25TH (05/25/2014)"
--   >>> let (Just result) = unpickleDoc xp_early_line_date tn
--   >>> result
--   2014-05-25 00:00:00 UTC
--   >>> pickleDoc xp_early_line_date result
--   NTree (XTag "/" []) [NTree (XText "SUNDAY, MAY 25TH (05/25/2014)") []]
--
--   >>> let tn = text_node "SATURDAY, JUNE 7TH (06/07/2014)"
--   >>> let (Just result) = unpickleDoc xp_early_line_date tn
--   >>> result
--   2014-06-07 00:00:00 UTC
--   >>> pickleDoc xp_early_line_date result
--   NTree (XTag "/" []) [NTree (XText "SATURDAY, JUNE 7TH (06/07/2014)") []]
--
xp_early_line_date :: PU UTCTime
xp_early_line_date = xpWrapMaybe (parse_wacko, show_wacko) xpText
  where
    -- | Upper-case both halves of every pair in a list of names.
    shout :: [(String,String)] -> [(String,String)]
    shout names = [ (map toUpper s1, map toUpper s2) | (s1,s2) <- names ]

    -- | Our own time locale that talks IN ALL CAPS. Actually,
    --   'parseTime' doesn't seem to care about the case. But when we
    --   spit it back out again ('formatTime'), we'll want it to be
    --   in all caps.
    caps_time_locale :: TimeLocale
    caps_time_locale =
      defaultTimeLocale { wDays  = shout (wDays defaultTimeLocale),
                          months = shout (months defaultTimeLocale) }

    -- | The whole format string, given the (upper-case) suffix that
    --   follows the day of the month.
    wacko_date_format :: String -> String
    wacko_date_format suffix =
      "%A, %B %-d" ++ suffix ++ " (" ++ date_format_padded ++ ")"

    -- | We don't know which suffix the input has, so we try all four
    --   formats in turn and keep the first parse that succeeds.
    parse_wacko :: String -> Maybe UTCTime
    parse_wacko str = listToMaybe (catMaybes attempts)
      where
        attempts = map attempt ["ST", "ND", "RD","TH"]
        attempt suffix =
          parseTime caps_time_locale (wacko_date_format suffix) str

    -- | When formatting, the suffix is the upper-cased 'date_suffix'
    --   of the date itself.
    show_wacko :: UTCTime -> String
    show_wacko day =
      formatTime caps_time_locale (wacko_date_format upper_suffix) day
      where
        upper_suffix = map toUpper (date_suffix day)


-- | This is a replacement for @xpOption xpFoo@ within an
--   'xpAttr'. There's a bug in newer versions of HXT that prevents
--   us from using the usual 'xpOption' solution, so this is our
--   stopgap. It should work on any type that can be unpickled with a
--   plain read/show.
--
--   'Nothing' is pickled as the empty string, and any text that
--   'readMaybe' rejects is unpickled as 'Nothing'.
--
xp_attr_option :: forall a. (Read a, Show a) => PU (Maybe a)
xp_attr_option =
  (to_a, from_a) `xpWrap` xpText
  where
    -- The annotation mentions the outer type variable @a@, which is
    -- why the signature above has an explicit forall (and why the
    -- module needs ScopedTypeVariables).
    to_a :: String -> Maybe a
    to_a = readMaybe

    from_a :: Maybe a -> String
    from_a Nothing  = ""
    from_a (Just x) = show x


-- | Create an 'XmlTree' containing only the given text. This is
--   useful for testing (un)picklers, where we don't want to have to
--   bother to create a dummy XML document.
--
--   Examples:
--
--   >>> text_node "8:00"
--   NTree (XText "8:00") []
--
text_node :: String -> XmlTree
text_node s = NTree (XText s) []



--
-- * Tasty Tests
--

-- | A list of all tests for this module. This primarily exists to
--   eliminate the unused import/export warnings for 'unpickleDoc' and
--   'text_node' which are otherwise only used in the doctests.
--
pickler_tests :: TestTree
pickler_tests =
  testGroup
    "Pickler tests"
    [ test_pickle_of_unpickle_is_identity ]


-- | If we unpickle something and then pickle it, we should wind up
--   with the same thing we started with (plus an additional root
--   element).
--
--   The pickling is mapped over the result of the unpickling instead
--   of pattern-matching on 'Just', so if the unpickling ever fails we
--   get an ordinary assertion failure (expected a 'Just', but got
--   'Nothing') rather than a pattern match exception.
--
test_pickle_of_unpickle_is_identity :: TestTree
test_pickle_of_unpickle_is_identity =
  testCase "pickle composed with unpickle is (almost) the identity" $ do
    let tn = text_node "8:00"
    let actual = fmap (pickleDoc xp_ambiguous_time)
                      (unpickleDoc xp_ambiguous_time tn)
    let expected = Just (NTree (XTag (mkName "/") []) [tn])
    actual @?= expected