]> gitweb.michael.orlitzky.com - dead/htsn-import.git/blob - src/TSN/Parse.hs
Make the game_info schedule_id optional and add a test case for it.
[dead/htsn-import.git] / src / TSN / Parse.hs
1 module TSN.Parse (
2 ParseError,
3 format_parse_error,
4 parse_game_id,
5 parse_message,
6 parse_schedule_id,
7 parse_tests,
8 parse_time_stamp,
9 parse_xml_time_stamp,
10 parse_xmlfid,
11 time_format,
12 time_stamp_format )
13 where
14
15 import Data.Either.Utils ( maybeToEither )
16 import Data.Time.Clock ( UTCTime )
17 import Data.Time.Format ( parseTime )
18 import System.Locale ( defaultTimeLocale )
19 import Test.Tasty ( TestTree, testGroup )
20 import Test.Tasty.HUnit ( (@?=), testCase )
21 import Text.Read ( readMaybe )
22 import Text.XML.HXT.Core (
23 XmlTree,
24 (>>>),
25 (/>),
26 getChildren,
27 getText,
28 hasName,
29 runLA )
30
31 -- Local imports
32 import Xml ( unsafe_read_document )
33
34
35 -- | When parsing an element from an XML document (like the
36 -- XML_File_ID), there are a few things that can happen. First of
37 -- all, it can work. Good for you.
38 --
39 -- Or, you may find nothing. Like, the element is missing. We
40 -- represent that with a 'ParseNotFound' containing the name of
41 -- thing thing not-found as a 'String'.
42 --
43 -- Finally, you could find something, but be unable to interpret it
44 -- as the type you were expecting. For example, if you parse
45 -- \"WHATSUP\" out of a \<game_id\> which is supposed to contain
46 -- integers. We represent this case with a 'ParseMismatch'
47 -- containing the name of the thing that you were looking for, the
48 -- value that had the unexpected type, and finally the name of the
49 -- expected type (used in error messages).
50 --
51 data ParseError =
52 ParseNotFound String | ParseMismatch String String String
53 deriving (Eq, Show)
54
55
56 -- | Take a 'ParseError' and turn it into a human-readable description
57 -- of the problem.
58 --
59 format_parse_error :: ParseError -> String
60 format_parse_error (ParseNotFound item) =
61 "No " ++ item ++ " elements found."
62 format_parse_error (ParseMismatch item val expected_type) =
63 "Could not parse " ++ item ++ " " ++ val ++ " as " ++ expected_type ++ "."
64
65
66
67 -- | Parse the \"message\" element out of a document tree and return
68 -- it as an 'XmlTree'. We use an 'Either' for consistency.
69 --
70 parse_message :: XmlTree -> Either ParseError XmlTree
71 parse_message xmltree =
72 case elements of
73 [] -> Left $ ParseNotFound "message"
74 (x:_) -> Right x
75 where
76 parse :: XmlTree -> [XmlTree]
77 parse = runLA $ hasName "/" /> hasName "message"
78
79 elements = parse xmltree
80
81
82
83
84
85 -- | Parse an 'Int' from a direct descendent of the (top-level)
86 -- \<message\> element in an XmlTree. This is used to implement the
87 -- XML_File_ID, game_id, and schedule_id (the last two are specific
88 -- to "TSN.XML.GameInfo") parsers.
89 --
90 -- If the parse fails, we return the corresponding 'ParseError'
91 -- wrapped in a 'Left'. Otherwise the parsed value is returned in a
92 -- 'Right'.
93 --
94 parse_message_int :: String -> XmlTree -> Either ParseError Int
95 parse_message_int child xmltree =
96 case parse_results of
97 [] -> Left $ ParseNotFound child
98 (x:_) -> x
99 where
100 parse :: XmlTree -> [String]
101 parse = runLA $ hasName "/"
102 /> hasName "message"
103 /> hasName child
104 >>> getChildren
105 >>> getText
106
107 read_either_int :: String -> Either ParseError Int
108 read_either_int s =
109 maybeToEither (ParseMismatch child s "integer") (readMaybe s)
110
111 elements = parse xmltree
112 parse_results = map read_either_int elements
113
114
115
116 -- | Parse an optional 'Int' from a direct descendent of the
117 -- (top-level) \<message\> element in an XmlTree. This is just like
118 -- 'parse_message_int', except we expect the element/value to be
119 -- missing sometimes.
120 --
121 -- To handle the fact that the element/value is optional, we pattern
122 -- match on the 'ParseError' that comes back in case of failure. If
123 -- we didn't find anything, we turn that into a \"successful
124 -- nothing\". But if we find a value and it can't be parsed, we let
125 -- the error propagate, because that shouldn't happen. Of course, if
126 -- the parse worked, that's nice too: we wrap the parsed value in a
127 -- 'Just' and return that wrapped in a 'Right'
128 --
129 parse_message_int_optional :: String
130 -> XmlTree
131 -> Either ParseError (Maybe Int)
132 parse_message_int_optional child xmltree =
133 case (parse_message_int child xmltree) of
134 Left (ParseNotFound _) -> Right Nothing
135 Left pm@(ParseMismatch {}) -> Left pm
136 Right whatever -> Right (Just whatever)
137
138
139
140 -- | Extract the \"XML_File_ID\" element from a document. If we fail
141 -- to parse an XML_File_ID, we return an appropriate 'ParseError'
142 -- wrapped in a 'Left' constructor. The reason should be one of two
143 -- things:
144 --
145 -- 1. No XML_File_ID elements were found.
146 --
147 -- 2. An XML_File_ID element was found, but it could not be read
148 -- into an Int.
149 --
150 -- In general we expect some non-integer XML_File_IDs, because they
151 -- appear on the feed. But the htsn daemon refuses to save them at
152 -- the moment, so if we ever see an XML_File_ID that we can't parse,
153 -- it's truly an error.
154 --
155 parse_xmlfid :: XmlTree -> Either ParseError Int
156 parse_xmlfid = parse_message_int "XML_File_ID"
157
158
159
160 -- | Extract the \<game_id\> element from within the top-level
161 -- \<message\> of a document. These appear in the "TSN.XML.GameInfo"
162 -- documents. Unlike the \<XML_File_ID\> elements, the \<game_id\>
163 -- can be missing from GameInfo documents, so for our implementation
164 -- we use 'parse_message_int_optional' instead.
165 --
166 parse_game_id :: XmlTree -> Either ParseError (Maybe Int)
167 parse_game_id = parse_message_int_optional "game_id"
168
169
170
171 -- | Extract the \<schedule_id\> element from within the top-level
172 -- \<message\> of a document. Identical to 'parse_game_id' except
173 -- for the element name.
174 --
175 parse_schedule_id :: XmlTree -> Either ParseError (Maybe Int)
176 parse_schedule_id = parse_message_int_optional "schedule_id"
177
178
179
180 -- | The format string for times appearing in the feed.
181 --
182 time_format :: String
183 time_format = "%I:%M %p"
184
185
186
187 -- | The format string for a time_stamp. We keep the leading/trailing
188 -- space so that parseTime and formatTime are inverses are one
189 -- another, even though there is some confusion as to how these two
190 -- functions should behave:
191 --
192 -- <https://ghc.haskell.org/trac/ghc/ticket/9150>
193 --
194 time_stamp_format :: String
195 time_stamp_format = " %B %-d, %Y, at " ++ time_format ++ " ET "
196
197
198
199 -- | Parse a time stamp from a 'String' (maybe). TSN doesn't provide a
200 -- proper time zone name, so we parse it as UTC, and maybe our
201 -- eventual consumer can figure out a way to deduce the time zone.
202 --
203 parse_time_stamp :: String -> Maybe UTCTime
204 parse_time_stamp =
205 parseTime defaultTimeLocale time_stamp_format
206
207
208
209 -- | Extract the \"time_stamp\" element from a document. If we fail to
210 -- parse a time_stamp, we return an appropriate 'ParseError'. The
211 -- reason should be one of two things:
212 --
213 -- 1. No time_Stamp elements were found.
214 --
215 -- 2. A time_stamp element was found, but it could not be read
216 -- into a UTCTime.
217 --
218 -- We don't expect to run into any time_stamps that we can't parse,
219 -- and they can never be missing, so both conditions are truly
220 -- errors.
221 --
222 parse_xml_time_stamp :: XmlTree -> Either ParseError UTCTime
223 parse_xml_time_stamp xmltree =
224 case parse_results of
225 [] -> Left $ ParseNotFound "time_stamp"
226 (x:_) -> x
227 where
228 parse :: XmlTree -> [String]
229 parse = runLA $ hasName "/"
230 /> hasName "message"
231 /> hasName "time_stamp"
232 >>> getChildren
233 >>> getText
234
235 read_either_utctime :: String -> Either ParseError UTCTime
236 read_either_utctime s =
237 maybeToEither (ParseMismatch "time_stamp" s "date/time")
238 (parse_time_stamp s)
239
240 elements = parse xmltree
241 parse_results = map read_either_utctime elements
242
243
244
245 --
246 -- * Tests
247 --
248
249 -- | A list of all tests for this module.
250 --
251 parse_tests :: TestTree
252 parse_tests =
253 testGroup
254 "TSN.Parse tests"
255 [ test_parse_game_id,
256 test_parse_missing_game_id,
257 test_parse_missing_schedule_id,
258 test_parse_schedule_id,
259 test_parse_xmlfid ]
260 where
261 sample_path :: String
262 sample_path = "test/xml/gameinfo/CBASK_Lineup_XML.xml"
263
264 desc :: String -> String
265 desc child = "a known " ++ child ++ " is parsed correctly"
266
267
268 -- | Actual implementation of the test for parse_xmlfid,
269 -- parse_game_id, and parse_schedule_id.
270 --
271 test_child :: String -> Int -> TestTree
272 test_child child expected =
273 testCase (desc child) $ do
274 xmltree <- unsafe_read_document sample_path
275 let actual = parse_message_int child xmltree
276 actual @?= (Right expected)
277
278
279 -- | Make sure we can parse a game_id into the expected value.
280 --
281 test_parse_game_id :: TestTree
282 test_parse_game_id = test_child "game_id" 97865
283
284
285 -- | Make sure we can parse a schedule_id (different from the
286 -- game_id) into the expected value.
287 --
288 test_parse_schedule_id :: TestTree
289 test_parse_schedule_id = test_child "schedule_id" 10199
290
291
292 -- | Make sure we can parse an XML_File_ID into the expected value.
293 --
294 test_parse_xmlfid :: TestTree
295 test_parse_xmlfid = test_child "XML_File_ID" 17
296
297
298
299 -- | The game_id element can be missing, so we test that too.
300 --
301 test_parse_missing_game_id :: TestTree
302 test_parse_missing_game_id =
303 testCase "missing game_id is not an error" $ do
304 xmltree <- unsafe_read_document "test/xml/gameinfo/MLB_Matchup_XML.xml"
305 let actual = parse_game_id xmltree
306 let expected = Right Nothing
307 actual @?= expected
308
309
310 -- | The schedule_id element can be missing, so we test that too.
311 --
312 test_parse_missing_schedule_id :: TestTree
313 test_parse_missing_schedule_id =
314 testCase "missing schedule_id is not an error" $ do
315 let path = "test/xml/gameinfo/recapxml-no-game-schedule-ids.xml"
316 xmltree <- unsafe_read_document path
317 let actual = parse_schedule_id xmltree
318 let expected = Right Nothing
319 actual @?= expected