X-Git-Url: https://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FTSN%2FParse.hs;h=9fe51259ddb518b46a83338fe55792226af7bbae;hb=c8ec4174a46c44215ef9540a9b19b99323fb0717;hp=54175d762ed52ca72052fcb297dd632d30087eee;hpb=4ce681700509beedf38026568ea20102801e6516;p=dead%2Fhtsn-import.git diff --git a/src/TSN/Parse.hs b/src/TSN/Parse.hs index 54175d7..9fe5125 100644 --- a/src/TSN/Parse.hs +++ b/src/TSN/Parse.hs @@ -1,5 +1,10 @@ module TSN.Parse ( + ParseError, + format_parse_error, + parse_game_id, parse_message, + parse_schedule_id, + parse_tests, parse_time_stamp, parse_xml_time_stamp, parse_xmlfid, @@ -11,6 +16,8 @@ import Data.Either.Utils ( maybeToEither ) import Data.Time.Clock ( UTCTime ) import Data.Time.Format ( parseTime ) import System.Locale ( defaultTimeLocale ) +import Test.Tasty ( TestTree, testGroup ) +import Test.Tasty.HUnit ( (@?=), testCase ) import Text.Read ( readMaybe ) import Text.XML.HXT.Core ( XmlTree, @@ -21,17 +28,49 @@ import Text.XML.HXT.Core ( hasName, runLA ) +-- Local imports +import Xml ( unsafe_read_document ) --- | Parse the \"message\" element out of a document tree and return --- it as an 'XmlTree'. We use an Either for consistency. + +-- | When parsing an element from an XML document (like the +-- XML_File_ID), there are a few things that can happen. First of +-- all, it can work. Good for you. +-- +-- Or, you may find nothing. Like, the element is missing. We +-- represent that with a 'ParseNotFound' containing the name of +-- thing thing not-found as a 'String'. -- --- Note: It's more trouble than it's worth to attempt to use this as --- the basis for parse_xmlfid and parse_xml_time_stamp. +-- Finally, you could find something, but be unable to interpret it +-- as the type you were expecting. For example, if you parse +-- \"WHATSUP\" out of a \ which is supposed to contain +-- integers. We represent this case with a 'ParseMismatch' +-- containing the name of the thing that you were looking for, the +-- value that had the unexpected type, and finally the name of the +-- expected type (used in error messages). -- -parse_message :: XmlTree -> Either String XmlTree +data ParseError = + ParseNotFound String | ParseMismatch String String String + deriving (Eq, Show) + + +-- | Take a 'ParseError' and turn it into a human-readable description +-- of the problem. +-- +format_parse_error :: ParseError -> String +format_parse_error (ParseNotFound item) = + "No " ++ item ++ " elements found." +format_parse_error (ParseMismatch item val expected_type) = + "Could not parse " ++ item ++ " " ++ val ++ " as " ++ expected_type ++ "." + + + +-- | Parse the \"message\" element out of a document tree and return +-- it as an 'XmlTree'. We use an 'Either' for consistency. +-- +parse_message :: XmlTree -> Either ParseError XmlTree parse_message xmltree = case elements of - [] -> Left "No message elements found." + [] -> Left $ ParseNotFound "message" (x:_) -> Right x where parse :: XmlTree -> [XmlTree] @@ -41,46 +80,108 @@ parse_message xmltree = --- | Extract the \"XML_File_ID\" element from a document. If we fail --- to parse an XML_File_ID, we return the reason wrapped in a 'Left' --- constructor. The reason should be one of two things: --- --- 1. No XML_File_ID elements were found. --- --- 2. An XML_File_ID element was found, but it could not be read --- into an Integer. --- --- We use an Either rather than a Maybe because we do expect some --- non-integer XML_File_IDs. In the examples, you will see --- NHL_DepthChart_XML.XML with an XML_File_ID of \"49618.61\" and --- CFL_Boxscore_XML1.xml with an XML_File_ID of --- \"R28916\". According to Brijesh Patel of TSN, these are special --- category files and not part of the usual feed. + + +-- | Parse an 'Int' from a direct descendent of the (top-level) +-- \ element in an XmlTree. This is used to implement the +-- XML_File_ID, game_id, and schedule_id (the last two are specific +-- to "TSN.XML.GameInfo") parsers. -- --- TODO: This should eventually be combined with XML.parse_xmlfid --- from the htsn package. +-- If the parse fails, we return the corresponding 'ParseError' +-- wrapped in a 'Left'. Otherwise the parsed value is returned in a +-- 'Right'. -- -parse_xmlfid :: XmlTree -> Either String Integer -parse_xmlfid xmltree = +parse_message_int :: String -> XmlTree -> Either ParseError Int +parse_message_int child xmltree = case parse_results of - [] -> Left "No XML_File_ID elements found." + [] -> Left $ ParseNotFound child (x:_) -> x where parse :: XmlTree -> [String] parse = runLA $ hasName "/" /> hasName "message" - /> hasName "XML_File_ID" + /> hasName child >>> getChildren >>> getText - read_either_integer :: String -> Either String Integer - read_either_integer s = - let msg = "Could not parse XML_File_ID " ++ s ++ " as an integer." - in - maybeToEither msg (readMaybe s) + read_either_int :: String -> Either ParseError Int + read_either_int s = + maybeToEither (ParseMismatch child s "integer") (readMaybe s) elements = parse xmltree - parse_results = map read_either_integer elements + parse_results = map read_either_int elements + + + +-- | Extract the \"XML_File_ID\" element from a document. If we fail +-- to parse an XML_File_ID, we return an appropriate 'ParseError' +-- wrapped in a 'Left' constructor. The reason should be one of two +-- things: +-- +-- 1. No XML_File_ID elements were found. +-- +-- 2. An XML_File_ID element was found, but it could not be read +-- into an Int. +-- +-- In general we expect some non-integer XML_File_IDs, because they +-- appear on the feed. But the htsn daemon refuses to save them at +-- the moment, so if we ever see an XML_File_ID that we can't parse, +-- it's truly an error. +-- +parse_xmlfid :: XmlTree -> Either ParseError Int +parse_xmlfid = parse_message_int "XML_File_ID" + + + +-- | Extract the \ element from within the top-level +-- \ of a document. These appear in the "TSN.XML.GameInfo" +-- documents. Unlike the \ and \ +-- elements, the \ can be missing from GameInfo +-- documents. So even the 'Right' value of the 'Either' can be +-- \"missing\". There are two reasons that the parse might fail. +-- +-- 1. No such elements were found. This is expected sometimes, and +-- should be returned as a 'Right' 'Nothing'. +-- +-- 2. An element was found, but it could not be read into an +-- 'Int'. This is NOT expected, and will be returned as a +-- 'ParseError', wrapped in a 'Left'. +-- +-- Most of implementation for this ('parse_message_int') is shared, +-- but to handle the fact that game_id is optional, we pattern match +-- on the 'ParseError' that comes back in case of failure. If we +-- didn't find any game_id elements, we turn that into a +-- \"successful nothing\". But if we find a game_id and it can't be +-- parsed, we let the error propagate, because that shouldn't +-- happen. Of course, if the parse worked, that's nice too: we wrap +-- the parsed value in a 'Just' and return that wrapped in a 'Right' +-- +parse_game_id :: XmlTree -> Either ParseError (Maybe Int) +parse_game_id xml = + case (parse_message_int "game_id" xml) of + Left (ParseNotFound _) -> Right Nothing + Left pm@(ParseMismatch {}) -> Left pm + Right whatever -> Right (Just whatever) + + + +-- | Extract the \ element from within the top-level +-- \ of a document. These appear in the +-- "TSN.XML.GameInfo" documents. If we fail to parse a schedule_id, +-- we return the reason wrapped in an appropriate 'ParseError'. The reason +-- should be one of two things: +-- +-- 1. No such elements were found. +-- +-- 2. An element was found, but it could not be read +-- into an Int. +-- +-- Both of these are truly errors in the case of schedule_id. The +-- implementation for this ('parse_message_int') is shared among a +-- few functions. +-- +parse_schedule_id :: XmlTree -> Either ParseError Int +parse_schedule_id = parse_message_int "schedule_id" @@ -90,6 +191,7 @@ time_format :: String time_format = "%I:%M %p" + -- | The format string for a time_stamp. We keep the leading/trailing -- space so that parseTime and formatTime are inverses are one -- another, even though there is some confusion as to how these two @@ -112,23 +214,23 @@ parse_time_stamp = --- | Extract the \"time_stamp\" element from a document. If we fail --- to parse a time_stamp, we return the reason wrapped in a 'Left' --- constructor. The reason should be one of two things: +-- | Extract the \"time_stamp\" element from a document. If we fail to +-- parse a time_stamp, we return an appropriate 'ParseError'. The +-- reason should be one of two things: -- -- 1. No time_Stamp elements were found. -- -- 2. A time_stamp element was found, but it could not be read -- into a UTCTime. -- --- Unline 'parse_xmlfid', we don't expect to run into any time_stamps --- that we can't parse. But since parse_xmlfid returns an Either, we --- do for consistency. +-- We don't expect to run into any time_stamps that we can't parse, +-- and they can never be missing, so both conditions are truly +-- errors. -- -parse_xml_time_stamp :: XmlTree -> Either String UTCTime +parse_xml_time_stamp :: XmlTree -> Either ParseError UTCTime parse_xml_time_stamp xmltree = case parse_results of - [] -> Left "No time_stamp elements found." + [] -> Left $ ParseNotFound "time_stamp" (x:_) -> x where parse :: XmlTree -> [String] @@ -138,11 +240,75 @@ parse_xml_time_stamp xmltree = >>> getChildren >>> getText - read_either_utctime :: String -> Either String UTCTime + read_either_utctime :: String -> Either ParseError UTCTime read_either_utctime s = - let msg = "Could not parse time_stamp " ++ s ++ " as a date/time." - in - maybeToEither msg (parse_time_stamp s) + maybeToEither (ParseMismatch "time_stamp" s "date/time") + (parse_time_stamp s) elements = parse xmltree parse_results = map read_either_utctime elements + + + +-- +-- * Tests +-- + +-- | A list of all tests for this module. +-- +parse_tests :: TestTree +parse_tests = + testGroup + "TSN.Parse tests" + [ test_parse_game_id, + test_parse_missing_game_id, + test_parse_schedule_id, + test_parse_xmlfid ] + where + sample_path :: String + sample_path = "test/xml/gameinfo/CBASK_Lineup_XML.xml" + + desc :: String -> String + desc child = "a known " ++ child ++ " is parsed correctly" + + + -- | Actual implementation of the test for parse_xmlfid, + -- parse_game_id, and parse_schedule_id. + -- + test_child :: String -> Int -> TestTree + test_child child expected = + testCase (desc child) $ do + xmltree <- unsafe_read_document sample_path + let actual = parse_message_int child xmltree + actual @?= (Right expected) + + + -- | Make sure we can parse a game_id into the expected value. + -- + test_parse_game_id :: TestTree + test_parse_game_id = test_child "game_id" 97865 + + + -- | Make sure we can parse a schedule_id (different from the + -- game_id) into the expected value. + -- + test_parse_schedule_id :: TestTree + test_parse_schedule_id = test_child "schedule_id" 10199 + + + -- | Make sure we can parse an XML_File_ID into the expected value. + -- + test_parse_xmlfid :: TestTree + test_parse_xmlfid = test_child "XML_File_ID" 17 + + + +-- | The game_id element can be missing, so we test that too. +-- +test_parse_missing_game_id :: TestTree +test_parse_missing_game_id = + testCase "missing game_id is not an error" $ do + xmltree <- unsafe_read_document "test/xml/gameinfo/MLB_Matchup_XML.xml" + let actual = parse_game_id xmltree + let expected = Right Nothing + actual @?= expected