From: Michael Orlitzky Date: Wed, 30 Jul 2014 07:27:43 +0000 (-0400) Subject: Add game_id/schedule_id parsers to TSN.Parse. X-Git-Tag: 0.1.1~9 X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fhtsn-import.git;a=commitdiff_plain;h=c8ec4174a46c44215ef9540a9b19b99323fb0717 Add game_id/schedule_id parsers to TSN.Parse. Define a new ParseError type to be returned from the TSN.Parse parsers. Add tests for the TSN.Parse module. Update SportInfo with the new ParseError type signature. Add an optional game_id and required schedule_id field to GameInfo. --- diff --git a/src/Main.hs b/src/Main.hs index 59da419..0682f8e 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -45,6 +45,7 @@ import Network.Services.TSN.Report ( report_info, report_error ) import TSN.DbImport ( DbImport(..), ImportResult(..) ) +import TSN.Parse ( format_parse_error ) import qualified TSN.XML.AutoRacingResults as AutoRacingResults ( dtd, pickle_message ) @@ -250,7 +251,7 @@ import_file cfg path = do case either_m of -- This might give us a slightly better error -- message than the default 'errmsg'. - Left err -> return $ ImportFailed err + Left err -> return $ ImportFailed (format_parse_error err) Right m -> migrate_and_import m | dtd `elem` SportInfo.dtds = do @@ -258,7 +259,7 @@ import_file cfg path = do case either_m of -- This might give us a slightly better error -- message than the default 'errmsg'. - Left err -> return $ ImportFailed err + Left err -> return $ ImportFailed (format_parse_error err) Right m -> migrate_and_import m | otherwise = do diff --git a/src/TSN/Parse.hs b/src/TSN/Parse.hs index 54175d7..9fe5125 100644 --- a/src/TSN/Parse.hs +++ b/src/TSN/Parse.hs @@ -1,5 +1,10 @@ module TSN.Parse ( + ParseError, + format_parse_error, + parse_game_id, parse_message, + parse_schedule_id, + parse_tests, parse_time_stamp, parse_xml_time_stamp, parse_xmlfid, @@ -11,6 +16,8 @@ import Data.Either.Utils ( maybeToEither ) import Data.Time.Clock ( UTCTime ) import Data.Time.Format ( parseTime ) import System.Locale ( defaultTimeLocale ) +import Test.Tasty ( TestTree, testGroup ) +import Test.Tasty.HUnit ( (@?=), testCase ) import Text.Read ( readMaybe ) import Text.XML.HXT.Core ( XmlTree, @@ -21,17 +28,49 @@ import Text.XML.HXT.Core ( hasName, runLA ) +-- Local imports +import Xml ( unsafe_read_document ) --- | Parse the \"message\" element out of a document tree and return --- it as an 'XmlTree'. We use an Either for consistency. + +-- | When parsing an element from an XML document (like the +-- XML_File_ID), there are a few things that can happen. First of +-- all, it can work. Good for you. +-- +-- Or, you may find nothing. Like, the element is missing. We +-- represent that with a 'ParseNotFound' containing the name of +-- thing thing not-found as a 'String'. -- --- Note: It's more trouble than it's worth to attempt to use this as --- the basis for parse_xmlfid and parse_xml_time_stamp. +-- Finally, you could find something, but be unable to interpret it +-- as the type you were expecting. For example, if you parse +-- \"WHATSUP\" out of a \ which is supposed to contain +-- integers. We represent this case with a 'ParseMismatch' +-- containing the name of the thing that you were looking for, the +-- value that had the unexpected type, and finally the name of the +-- expected type (used in error messages). -- -parse_message :: XmlTree -> Either String XmlTree +data ParseError = + ParseNotFound String | ParseMismatch String String String + deriving (Eq, Show) + + +-- | Take a 'ParseError' and turn it into a human-readable description +-- of the problem. +-- +format_parse_error :: ParseError -> String +format_parse_error (ParseNotFound item) = + "No " ++ item ++ " elements found." +format_parse_error (ParseMismatch item val expected_type) = + "Could not parse " ++ item ++ " " ++ val ++ " as " ++ expected_type ++ "." + + + +-- | Parse the \"message\" element out of a document tree and return +-- it as an 'XmlTree'. We use an 'Either' for consistency. +-- +parse_message :: XmlTree -> Either ParseError XmlTree parse_message xmltree = case elements of - [] -> Left "No message elements found." + [] -> Left $ ParseNotFound "message" (x:_) -> Right x where parse :: XmlTree -> [XmlTree] @@ -41,46 +80,108 @@ parse_message xmltree = --- | Extract the \"XML_File_ID\" element from a document. If we fail --- to parse an XML_File_ID, we return the reason wrapped in a 'Left' --- constructor. The reason should be one of two things: --- --- 1. No XML_File_ID elements were found. --- --- 2. An XML_File_ID element was found, but it could not be read --- into an Integer. --- --- We use an Either rather than a Maybe because we do expect some --- non-integer XML_File_IDs. In the examples, you will see --- NHL_DepthChart_XML.XML with an XML_File_ID of \"49618.61\" and --- CFL_Boxscore_XML1.xml with an XML_File_ID of --- \"R28916\". According to Brijesh Patel of TSN, these are special --- category files and not part of the usual feed. + + +-- | Parse an 'Int' from a direct descendent of the (top-level) +-- \ element in an XmlTree. This is used to implement the +-- XML_File_ID, game_id, and schedule_id (the last two are specific +-- to "TSN.XML.GameInfo") parsers. -- --- TODO: This should eventually be combined with XML.parse_xmlfid --- from the htsn package. +-- If the parse fails, we return the corresponding 'ParseError' +-- wrapped in a 'Left'. Otherwise the parsed value is returned in a +-- 'Right'. -- -parse_xmlfid :: XmlTree -> Either String Integer -parse_xmlfid xmltree = +parse_message_int :: String -> XmlTree -> Either ParseError Int +parse_message_int child xmltree = case parse_results of - [] -> Left "No XML_File_ID elements found." + [] -> Left $ ParseNotFound child (x:_) -> x where parse :: XmlTree -> [String] parse = runLA $ hasName "/" /> hasName "message" - /> hasName "XML_File_ID" + /> hasName child >>> getChildren >>> getText - read_either_integer :: String -> Either String Integer - read_either_integer s = - let msg = "Could not parse XML_File_ID " ++ s ++ " as an integer." - in - maybeToEither msg (readMaybe s) + read_either_int :: String -> Either ParseError Int + read_either_int s = + maybeToEither (ParseMismatch child s "integer") (readMaybe s) elements = parse xmltree - parse_results = map read_either_integer elements + parse_results = map read_either_int elements + + + +-- | Extract the \"XML_File_ID\" element from a document. If we fail +-- to parse an XML_File_ID, we return an appropriate 'ParseError' +-- wrapped in a 'Left' constructor. The reason should be one of two +-- things: +-- +-- 1. No XML_File_ID elements were found. +-- +-- 2. An XML_File_ID element was found, but it could not be read +-- into an Int. +-- +-- In general we expect some non-integer XML_File_IDs, because they +-- appear on the feed. But the htsn daemon refuses to save them at +-- the moment, so if we ever see an XML_File_ID that we can't parse, +-- it's truly an error. +-- +parse_xmlfid :: XmlTree -> Either ParseError Int +parse_xmlfid = parse_message_int "XML_File_ID" + + + +-- | Extract the \ element from within the top-level +-- \ of a document. These appear in the "TSN.XML.GameInfo" +-- documents. Unlike the \ and \ +-- elements, the \ can be missing from GameInfo +-- documents. So even the 'Right' value of the 'Either' can be +-- \"missing\". There are two reasons that the parse might fail. +-- +-- 1. No such elements were found. This is expected sometimes, and +-- should be returned as a 'Right' 'Nothing'. +-- +-- 2. An element was found, but it could not be read into an +-- 'Int'. This is NOT expected, and will be returned as a +-- 'ParseError', wrapped in a 'Left'. +-- +-- Most of implementation for this ('parse_message_int') is shared, +-- but to handle the fact that game_id is optional, we pattern match +-- on the 'ParseError' that comes back in case of failure. If we +-- didn't find any game_id elements, we turn that into a +-- \"successful nothing\". But if we find a game_id and it can't be +-- parsed, we let the error propagate, because that shouldn't +-- happen. Of course, if the parse worked, that's nice too: we wrap +-- the parsed value in a 'Just' and return that wrapped in a 'Right' +-- +parse_game_id :: XmlTree -> Either ParseError (Maybe Int) +parse_game_id xml = + case (parse_message_int "game_id" xml) of + Left (ParseNotFound _) -> Right Nothing + Left pm@(ParseMismatch {}) -> Left pm + Right whatever -> Right (Just whatever) + + + +-- | Extract the \ element from within the top-level +-- \ of a document. These appear in the +-- "TSN.XML.GameInfo" documents. If we fail to parse a schedule_id, +-- we return the reason wrapped in an appropriate 'ParseError'. The reason +-- should be one of two things: +-- +-- 1. No such elements were found. +-- +-- 2. An element was found, but it could not be read +-- into an Int. +-- +-- Both of these are truly errors in the case of schedule_id. The +-- implementation for this ('parse_message_int') is shared among a +-- few functions. +-- +parse_schedule_id :: XmlTree -> Either ParseError Int +parse_schedule_id = parse_message_int "schedule_id" @@ -90,6 +191,7 @@ time_format :: String time_format = "%I:%M %p" + -- | The format string for a time_stamp. We keep the leading/trailing -- space so that parseTime and formatTime are inverses are one -- another, even though there is some confusion as to how these two @@ -112,23 +214,23 @@ parse_time_stamp = --- | Extract the \"time_stamp\" element from a document. If we fail --- to parse a time_stamp, we return the reason wrapped in a 'Left' --- constructor. The reason should be one of two things: +-- | Extract the \"time_stamp\" element from a document. If we fail to +-- parse a time_stamp, we return an appropriate 'ParseError'. The +-- reason should be one of two things: -- -- 1. No time_Stamp elements were found. -- -- 2. A time_stamp element was found, but it could not be read -- into a UTCTime. -- --- Unline 'parse_xmlfid', we don't expect to run into any time_stamps --- that we can't parse. But since parse_xmlfid returns an Either, we --- do for consistency. +-- We don't expect to run into any time_stamps that we can't parse, +-- and they can never be missing, so both conditions are truly +-- errors. -- -parse_xml_time_stamp :: XmlTree -> Either String UTCTime +parse_xml_time_stamp :: XmlTree -> Either ParseError UTCTime parse_xml_time_stamp xmltree = case parse_results of - [] -> Left "No time_stamp elements found." + [] -> Left $ ParseNotFound "time_stamp" (x:_) -> x where parse :: XmlTree -> [String] @@ -138,11 +240,75 @@ parse_xml_time_stamp xmltree = >>> getChildren >>> getText - read_either_utctime :: String -> Either String UTCTime + read_either_utctime :: String -> Either ParseError UTCTime read_either_utctime s = - let msg = "Could not parse time_stamp " ++ s ++ " as a date/time." - in - maybeToEither msg (parse_time_stamp s) + maybeToEither (ParseMismatch "time_stamp" s "date/time") + (parse_time_stamp s) elements = parse xmltree parse_results = map read_either_utctime elements + + + +-- +-- * Tests +-- + +-- | A list of all tests for this module. +-- +parse_tests :: TestTree +parse_tests = + testGroup + "TSN.Parse tests" + [ test_parse_game_id, + test_parse_missing_game_id, + test_parse_schedule_id, + test_parse_xmlfid ] + where + sample_path :: String + sample_path = "test/xml/gameinfo/CBASK_Lineup_XML.xml" + + desc :: String -> String + desc child = "a known " ++ child ++ " is parsed correctly" + + + -- | Actual implementation of the test for parse_xmlfid, + -- parse_game_id, and parse_schedule_id. + -- + test_child :: String -> Int -> TestTree + test_child child expected = + testCase (desc child) $ do + xmltree <- unsafe_read_document sample_path + let actual = parse_message_int child xmltree + actual @?= (Right expected) + + + -- | Make sure we can parse a game_id into the expected value. + -- + test_parse_game_id :: TestTree + test_parse_game_id = test_child "game_id" 97865 + + + -- | Make sure we can parse a schedule_id (different from the + -- game_id) into the expected value. + -- + test_parse_schedule_id :: TestTree + test_parse_schedule_id = test_child "schedule_id" 10199 + + + -- | Make sure we can parse an XML_File_ID into the expected value. + -- + test_parse_xmlfid :: TestTree + test_parse_xmlfid = test_child "XML_File_ID" 17 + + + +-- | The game_id element can be missing, so we test that too. +-- +test_parse_missing_game_id :: TestTree +test_parse_missing_game_id = + testCase "missing game_id is not an error" $ do + xmltree <- unsafe_read_document "test/xml/gameinfo/MLB_Matchup_XML.xml" + let actual = parse_game_id xmltree + let expected = Right Nothing + actual @?= expected diff --git a/src/TSN/XML/GameInfo.hs b/src/TSN/XML/GameInfo.hs index 2b5e1ad..2830295 100644 --- a/src/TSN/XML/GameInfo.hs +++ b/src/TSN/XML/GameInfo.hs @@ -46,7 +46,10 @@ import TSN.DbImport ( ImportResult(..), run_dbmigrate ) import TSN.Parse ( + ParseError, + parse_game_id, parse_message, + parse_schedule_id, parse_xmlfid, parse_xml_time_stamp ) import Xml ( unsafe_read_document ) @@ -84,10 +87,23 @@ dtds = -- | This serves as both the database and XML representation of a -- GameInfo \. -- +-- The 'game_id' and 'schedule_id' fields are foreign keys, but they +-- key into multiple tables and key on records which may not exist +-- when we import the GameInfo document. We therefore don't declare +-- them as foreign keys; i.e. we don't require them to point +-- anywhere in particular. But if they do, that's nice. +-- data GameInfo = GameInfo { dtd :: String, xml_file_id :: Int, + game_id :: Maybe Int, -- ^ These are optional because they are missing + -- from at least the MLB_Matchup_XML.dtd documents. + -- They provide foreign keys into any tables storing + -- games with their IDs. + + schedule_id :: Int, -- ^ Required foreign key into any table storing a + -- schedule along with its ID. time_stamp :: UTCTime, xml :: String } deriving (Eq, Show) @@ -96,15 +112,23 @@ data GameInfo = -- | Attempt to parse a 'GameInfo' from an 'XmlTree'. If we cannot, -- we fail with an error message. -- -parse_xml :: String -> XmlTree -> Either String GameInfo +parse_xml :: String -> XmlTree -> Either ParseError GameInfo parse_xml dtdname xmltree = do xmlfid <- parse_xmlfid xmltree + game_id <- parse_game_id xmltree + schedule_id <- parse_schedule_id xmltree timestamp <- parse_xml_time_stamp xmltree message <- parse_message xmltree - return $ GameInfo dtdname (fromInteger xmlfid) timestamp (xshow [message]) + return $ GameInfo + dtdname + xmlfid + game_id + schedule_id + timestamp + (xshow [message]) -- --- Database code +-- * Database code -- instance DbImport GameInfo where @@ -163,10 +187,14 @@ test_accessors = testCase "we can access a parsed game_info" $ do let ex2 = 21201550 let a3 = show $ time_stamp t let ex3 = "2014-05-31 15:13:00 UTC" - let a4 = take 9 (xml t) - let ex4 = "" - let actual = (a1,a2,a3,a4) - let expected = (ex1,ex2,ex3,ex4) + let a4 = game_id t + let ex4 = Just 39978 + let a5 = schedule_id t + let ex5 = 39978 + let a6 = take 9 (xml t) + let ex6 = "" + let actual = (a1,a2,a3,a4,a5,a6) + let expected = (ex1,ex2,ex3,ex4,ex5,ex6) actual @?= expected diff --git a/src/TSN/XML/SportInfo.hs b/src/TSN/XML/SportInfo.hs index 6bb99a2..4661484 100644 --- a/src/TSN/XML/SportInfo.hs +++ b/src/TSN/XML/SportInfo.hs @@ -50,6 +50,7 @@ import TSN.DbImport ( ImportResult(..), run_dbmigrate ) import TSN.Parse ( + ParseError, parse_message, parse_xmlfid, parse_xml_time_stamp ) @@ -208,12 +209,12 @@ data SportInfo = -- | Attempt to parse a 'SportInfo' from an 'XmlTree'. If we cannot, -- we fail with an error message. -- -parse_xml :: String -> XmlTree -> Either String SportInfo +parse_xml :: String -> XmlTree -> Either ParseError SportInfo parse_xml dtdname xmltree = do xmlfid <- parse_xmlfid xmltree timestamp <- parse_xml_time_stamp xmltree message <- parse_message xmltree - return $ SportInfo dtdname (fromInteger xmlfid) timestamp (xshow [message]) + return $ SportInfo dtdname xmlfid timestamp (xshow [message]) -- diff --git a/test/TestSuite.hs b/test/TestSuite.hs index e82c2a0..b64d2d6 100644 --- a/test/TestSuite.hs +++ b/test/TestSuite.hs @@ -1,5 +1,6 @@ import Test.Tasty ( TestTree, defaultMain, testGroup ) +import TSN.Parse ( parse_tests ) import TSN.Picklers ( pickler_tests ) import TSN.XML.AutoRacingResults ( auto_racing_results_tests ) import TSN.XML.AutoRacingSchedule ( auto_racing_schedule_tests ) @@ -31,6 +32,7 @@ tests = testGroup mlb_early_line_tests, news_tests, odds_tests, + parse_tests, pickler_tests, schedule_changes_tests, scores_tests,