module TSN.Parse (
+ ParseError,
+ format_parse_error,
+ parse_game_id,
parse_message,
+ parse_schedule_id,
+ parse_tests,
parse_time_stamp,
parse_xml_time_stamp,
parse_xmlfid,
import Data.Time.Clock ( UTCTime )
import Data.Time.Format ( parseTime )
import System.Locale ( defaultTimeLocale )
+import Test.Tasty ( TestTree, testGroup )
+import Test.Tasty.HUnit ( (@?=), testCase )
import Text.Read ( readMaybe )
import Text.XML.HXT.Core (
XmlTree,
hasName,
runLA )
+-- Local imports
+import Xml ( unsafe_read_document )
--- | Parse the \"message\" element out of a document tree and return
--- it as an 'XmlTree'. We use an Either for consistency.
+
+-- | When parsing an element from an XML document (like the
+-- XML_File_ID), there are a few things that can happen. First of
+-- all, it can work. Good for you.
+--
+-- Or, you may find nothing. Like, the element is missing. We
+-- represent that with a 'ParseNotFound' containing the name of
+-- thing thing not-found as a 'String'.
--
--- Note: It's more trouble than it's worth to attempt to use this as
--- the basis for parse_xmlfid and parse_xml_time_stamp.
+-- Finally, you could find something, but be unable to interpret it
+-- as the type you were expecting. For example, if you parse
+-- \"WHATSUP\" out of a \<game_id\> which is supposed to contain
+-- integers. We represent this case with a 'ParseMismatch'
+-- containing the name of the thing that you were looking for, the
+-- value that had the unexpected type, and finally the name of the
+-- expected type (used in error messages).
--
-parse_message :: XmlTree -> Either String XmlTree
+data ParseError =
+ ParseNotFound String | ParseMismatch String String String
+ deriving (Eq, Show)
+
+
+-- | Take a 'ParseError' and turn it into a human-readable description
+-- of the problem.
+--
+format_parse_error :: ParseError -> String
+format_parse_error (ParseNotFound item) =
+ "No " ++ item ++ " elements found."
+format_parse_error (ParseMismatch item val expected_type) =
+ "Could not parse " ++ item ++ " " ++ val ++ " as " ++ expected_type ++ "."
+
+
+
+-- | Parse the \"message\" element out of a document tree and return
+-- it as an 'XmlTree'. We use an 'Either' for consistency.
+--
+parse_message :: XmlTree -> Either ParseError XmlTree
parse_message xmltree =
case elements of
- [] -> Left "No message elements found."
+ [] -> Left $ ParseNotFound "message"
(x:_) -> Right x
where
parse :: XmlTree -> [XmlTree]
--- | Extract the \"XML_File_ID\" element from a document. If we fail
--- to parse an XML_File_ID, we return the reason wrapped in a 'Left'
--- constructor. The reason should be one of two things:
---
--- 1. No XML_File_ID elements were found.
---
--- 2. An XML_File_ID element was found, but it could not be read
--- into an Integer.
---
--- We use an Either rather than a Maybe because we do expect some
--- non-integer XML_File_IDs. In the examples, you will see
--- NHL_DepthChart_XML.XML with an XML_File_ID of \"49618.61\" and
--- CFL_Boxscore_XML1.xml with an XML_File_ID of
--- \"R28916\". According to Brijesh Patel of TSN, these are special
--- category files and not part of the usual feed.
+
+
+-- | Parse an 'Int' from a direct descendent of the (top-level)
+-- \<message\> element in an XmlTree. This is used to implement the
+-- XML_File_ID, game_id, and schedule_id (the last two are specific
+-- to "TSN.XML.GameInfo") parsers.
--
--- TODO: This should eventually be combined with XML.parse_xmlfid
--- from the htsn package.
+-- If the parse fails, we return the corresponding 'ParseError'
+-- wrapped in a 'Left'. Otherwise the parsed value is returned in a
+-- 'Right'.
--
-parse_xmlfid :: XmlTree -> Either String Integer
-parse_xmlfid xmltree =
+parse_message_int :: String -> XmlTree -> Either ParseError Int
+parse_message_int child xmltree =
case parse_results of
- [] -> Left "No XML_File_ID elements found."
+ [] -> Left $ ParseNotFound child
(x:_) -> x
where
parse :: XmlTree -> [String]
parse = runLA $ hasName "/"
/> hasName "message"
- /> hasName "XML_File_ID"
+ /> hasName child
>>> getChildren
>>> getText
- read_either_integer :: String -> Either String Integer
- read_either_integer s =
- let msg = "Could not parse XML_File_ID " ++ s ++ " as an integer."
- in
- maybeToEither msg (readMaybe s)
+ read_either_int :: String -> Either ParseError Int
+ read_either_int s =
+ maybeToEither (ParseMismatch child s "integer") (readMaybe s)
elements = parse xmltree
- parse_results = map read_either_integer elements
+ parse_results = map read_either_int elements
+
+
+
+-- | Extract the \"XML_File_ID\" element from a document. If we fail
+-- to parse an XML_File_ID, we return an appropriate 'ParseError'
+-- wrapped in a 'Left' constructor. The reason should be one of two
+-- things:
+--
+-- 1. No XML_File_ID elements were found.
+--
+-- 2. An XML_File_ID element was found, but it could not be read
+-- into an Int.
+--
+-- In general we expect some non-integer XML_File_IDs, because they
+-- appear on the feed. But the htsn daemon refuses to save them at
+-- the moment, so if we ever see an XML_File_ID that we can't parse,
+-- it's truly an error.
+--
+parse_xmlfid :: XmlTree -> Either ParseError Int
+parse_xmlfid = parse_message_int "XML_File_ID"
+
+
+
+-- | Extract the \<game_id\> element from within the top-level
+-- \<message\> of a document. These appear in the "TSN.XML.GameInfo"
+-- documents. Unlike the \<schedule_id\> and \<XML_File_ID\>
+-- elements, the \<game_id\> can be missing from GameInfo
+-- documents. So even the 'Right' value of the 'Either' can be
+-- \"missing\". There are two reasons that the parse might fail.
+--
+-- 1. No such elements were found. This is expected sometimes, and
+-- should be returned as a 'Right' 'Nothing'.
+--
+-- 2. An element was found, but it could not be read into an
+-- 'Int'. This is NOT expected, and will be returned as a
+-- 'ParseError', wrapped in a 'Left'.
+--
+-- Most of implementation for this ('parse_message_int') is shared,
+-- but to handle the fact that game_id is optional, we pattern match
+-- on the 'ParseError' that comes back in case of failure. If we
+-- didn't find any game_id elements, we turn that into a
+-- \"successful nothing\". But if we find a game_id and it can't be
+-- parsed, we let the error propagate, because that shouldn't
+-- happen. Of course, if the parse worked, that's nice too: we wrap
+-- the parsed value in a 'Just' and return that wrapped in a 'Right'
+--
+parse_game_id :: XmlTree -> Either ParseError (Maybe Int)
+parse_game_id xml =
+ case (parse_message_int "game_id" xml) of
+ Left (ParseNotFound _) -> Right Nothing
+ Left pm@(ParseMismatch {}) -> Left pm
+ Right whatever -> Right (Just whatever)
+
+
+
+-- | Extract the \<schedule_id\> element from within the top-level
+-- \<message\> of a document. These appear in the
+-- "TSN.XML.GameInfo" documents. If we fail to parse a schedule_id,
+-- we return the reason wrapped in an appropriate 'ParseError'. The reason
+-- should be one of two things:
+--
+-- 1. No such elements were found.
+--
+-- 2. An element was found, but it could not be read
+-- into an Int.
+--
+-- Both of these are truly errors in the case of schedule_id. The
+-- implementation for this ('parse_message_int') is shared among a
+-- few functions.
+--
+parse_schedule_id :: XmlTree -> Either ParseError Int
+parse_schedule_id = parse_message_int "schedule_id"
time_format = "%I:%M %p"
+
-- | The format string for a time_stamp. We keep the leading/trailing
-- space so that parseTime and formatTime are inverses are one
-- another, even though there is some confusion as to how these two
--- | Extract the \"time_stamp\" element from a document. If we fail
--- to parse a time_stamp, we return the reason wrapped in a 'Left'
--- constructor. The reason should be one of two things:
+-- | Extract the \"time_stamp\" element from a document. If we fail to
+-- parse a time_stamp, we return an appropriate 'ParseError'. The
+-- reason should be one of two things:
--
-- 1. No time_Stamp elements were found.
--
-- 2. A time_stamp element was found, but it could not be read
-- into a UTCTime.
--
--- Unline 'parse_xmlfid', we don't expect to run into any time_stamps
--- that we can't parse. But since parse_xmlfid returns an Either, we
--- do for consistency.
+-- We don't expect to run into any time_stamps that we can't parse,
+-- and they can never be missing, so both conditions are truly
+-- errors.
--
-parse_xml_time_stamp :: XmlTree -> Either String UTCTime
+parse_xml_time_stamp :: XmlTree -> Either ParseError UTCTime
parse_xml_time_stamp xmltree =
case parse_results of
- [] -> Left "No time_stamp elements found."
+ [] -> Left $ ParseNotFound "time_stamp"
(x:_) -> x
where
parse :: XmlTree -> [String]
>>> getChildren
>>> getText
- read_either_utctime :: String -> Either String UTCTime
+ read_either_utctime :: String -> Either ParseError UTCTime
read_either_utctime s =
- let msg = "Could not parse time_stamp " ++ s ++ " as a date/time."
- in
- maybeToEither msg (parse_time_stamp s)
+ maybeToEither (ParseMismatch "time_stamp" s "date/time")
+ (parse_time_stamp s)
elements = parse xmltree
parse_results = map read_either_utctime elements
+
+
+
+--
+-- * Tests
+--
+
+-- | A list of all tests for this module.
+--
+parse_tests :: TestTree
+parse_tests =
+ testGroup
+ "TSN.Parse tests"
+ [ test_parse_game_id,
+ test_parse_missing_game_id,
+ test_parse_schedule_id,
+ test_parse_xmlfid ]
+ where
+ sample_path :: String
+ sample_path = "test/xml/gameinfo/CBASK_Lineup_XML.xml"
+
+ desc :: String -> String
+ desc child = "a known " ++ child ++ " is parsed correctly"
+
+
+ -- | Actual implementation of the test for parse_xmlfid,
+ -- parse_game_id, and parse_schedule_id.
+ --
+ test_child :: String -> Int -> TestTree
+ test_child child expected =
+ testCase (desc child) $ do
+ xmltree <- unsafe_read_document sample_path
+ let actual = parse_message_int child xmltree
+ actual @?= (Right expected)
+
+
+ -- | Make sure we can parse a game_id into the expected value.
+ --
+ test_parse_game_id :: TestTree
+ test_parse_game_id = test_child "game_id" 97865
+
+
+ -- | Make sure we can parse a schedule_id (different from the
+ -- game_id) into the expected value.
+ --
+ test_parse_schedule_id :: TestTree
+ test_parse_schedule_id = test_child "schedule_id" 10199
+
+
+ -- | Make sure we can parse an XML_File_ID into the expected value.
+ --
+ test_parse_xmlfid :: TestTree
+ test_parse_xmlfid = test_child "XML_File_ID" 17
+
+
+
+-- | The game_id element can be missing, so we test that too.
+--
+test_parse_missing_game_id :: TestTree
+test_parse_missing_game_id =
+ testCase "missing game_id is not an error" $ do
+ xmltree <- unsafe_read_document "test/xml/gameinfo/MLB_Matchup_XML.xml"
+ let actual = parse_game_id xmltree
+ let expected = Right Nothing
+ actual @?= expected
ImportResult(..),
run_dbmigrate )
import TSN.Parse (
+ ParseError,
+ parse_game_id,
parse_message,
+ parse_schedule_id,
parse_xmlfid,
parse_xml_time_stamp )
import Xml ( unsafe_read_document )
-- | This serves as both the database and XML representation of a
-- GameInfo \<message\>.
--
+-- The 'game_id' and 'schedule_id' fields are foreign keys, but they
+-- key into multiple tables and key on records which may not exist
+-- when we import the GameInfo document. We therefore don't declare
+-- them as foreign keys; i.e. we don't require them to point
+-- anywhere in particular. But if they do, that's nice.
+--
data GameInfo =
GameInfo {
dtd :: String,
xml_file_id :: Int,
+ game_id :: Maybe Int, -- ^ These are optional because they are missing
+ -- from at least the MLB_Matchup_XML.dtd documents.
+ -- They provide foreign keys into any tables storing
+ -- games with their IDs.
+
+ schedule_id :: Int, -- ^ Required foreign key into any table storing a
+ -- schedule along with its ID.
time_stamp :: UTCTime,
xml :: String }
deriving (Eq, Show)
-- | Attempt to parse a 'GameInfo' from an 'XmlTree'. If we cannot,
-- we fail with an error message.
--
-parse_xml :: String -> XmlTree -> Either String GameInfo
+parse_xml :: String -> XmlTree -> Either ParseError GameInfo
parse_xml dtdname xmltree = do
xmlfid <- parse_xmlfid xmltree
+ game_id <- parse_game_id xmltree
+ schedule_id <- parse_schedule_id xmltree
timestamp <- parse_xml_time_stamp xmltree
message <- parse_message xmltree
- return $ GameInfo dtdname (fromInteger xmlfid) timestamp (xshow [message])
+ return $ GameInfo
+ dtdname
+ xmlfid
+ game_id
+ schedule_id
+ timestamp
+ (xshow [message])
--
--- Database code
+-- * Database code
--
instance DbImport GameInfo where
let ex2 = 21201550
let a3 = show $ time_stamp t
let ex3 = "2014-05-31 15:13:00 UTC"
- let a4 = take 9 (xml t)
- let ex4 = "<message>"
- let actual = (a1,a2,a3,a4)
- let expected = (ex1,ex2,ex3,ex4)
+ let a4 = game_id t
+ let ex4 = Just 39978
+ let a5 = schedule_id t
+ let ex5 = 39978
+ let a6 = take 9 (xml t)
+ let ex6 = "<message>"
+ let actual = (a1,a2,a3,a4,a5,a6)
+ let expected = (ex1,ex2,ex3,ex4,ex5,ex6)
actual @?= expected