]> gitweb.michael.orlitzky.com - dead/htsn-import.git/blob - src/TSN/Parse.hs
9fe51259ddb518b46a83338fe55792226af7bbae
[dead/htsn-import.git] / src / TSN / Parse.hs
1 module TSN.Parse (
2 ParseError,
3 format_parse_error,
4 parse_game_id,
5 parse_message,
6 parse_schedule_id,
7 parse_tests,
8 parse_time_stamp,
9 parse_xml_time_stamp,
10 parse_xmlfid,
11 time_format,
12 time_stamp_format )
13 where
14
15 import Data.Either.Utils ( maybeToEither )
16 import Data.Time.Clock ( UTCTime )
17 import Data.Time.Format ( parseTime )
18 import System.Locale ( defaultTimeLocale )
19 import Test.Tasty ( TestTree, testGroup )
20 import Test.Tasty.HUnit ( (@?=), testCase )
21 import Text.Read ( readMaybe )
22 import Text.XML.HXT.Core (
23 XmlTree,
24 (>>>),
25 (/>),
26 getChildren,
27 getText,
28 hasName,
29 runLA )
30
31 -- Local imports
32 import Xml ( unsafe_read_document )
33
34
35 -- | When parsing an element from an XML document (like the
36 -- XML_File_ID), there are a few things that can happen. First of
37 -- all, it can work. Good for you.
38 --
39 -- Or, you may find nothing. Like, the element is missing. We
40 -- represent that with a 'ParseNotFound' containing the name of
41 -- thing thing not-found as a 'String'.
42 --
43 -- Finally, you could find something, but be unable to interpret it
44 -- as the type you were expecting. For example, if you parse
45 -- \"WHATSUP\" out of a \<game_id\> which is supposed to contain
46 -- integers. We represent this case with a 'ParseMismatch'
47 -- containing the name of the thing that you were looking for, the
48 -- value that had the unexpected type, and finally the name of the
49 -- expected type (used in error messages).
50 --
51 data ParseError =
52 ParseNotFound String | ParseMismatch String String String
53 deriving (Eq, Show)
54
55
56 -- | Take a 'ParseError' and turn it into a human-readable description
57 -- of the problem.
58 --
59 format_parse_error :: ParseError -> String
60 format_parse_error (ParseNotFound item) =
61 "No " ++ item ++ " elements found."
62 format_parse_error (ParseMismatch item val expected_type) =
63 "Could not parse " ++ item ++ " " ++ val ++ " as " ++ expected_type ++ "."
64
65
66
67 -- | Parse the \"message\" element out of a document tree and return
68 -- it as an 'XmlTree'. We use an 'Either' for consistency.
69 --
70 parse_message :: XmlTree -> Either ParseError XmlTree
71 parse_message xmltree =
72 case elements of
73 [] -> Left $ ParseNotFound "message"
74 (x:_) -> Right x
75 where
76 parse :: XmlTree -> [XmlTree]
77 parse = runLA $ hasName "/" /> hasName "message"
78
79 elements = parse xmltree
80
81
82
83
84
85 -- | Parse an 'Int' from a direct descendent of the (top-level)
86 -- \<message\> element in an XmlTree. This is used to implement the
87 -- XML_File_ID, game_id, and schedule_id (the last two are specific
88 -- to "TSN.XML.GameInfo") parsers.
89 --
90 -- If the parse fails, we return the corresponding 'ParseError'
91 -- wrapped in a 'Left'. Otherwise the parsed value is returned in a
92 -- 'Right'.
93 --
94 parse_message_int :: String -> XmlTree -> Either ParseError Int
95 parse_message_int child xmltree =
96 case parse_results of
97 [] -> Left $ ParseNotFound child
98 (x:_) -> x
99 where
100 parse :: XmlTree -> [String]
101 parse = runLA $ hasName "/"
102 /> hasName "message"
103 /> hasName child
104 >>> getChildren
105 >>> getText
106
107 read_either_int :: String -> Either ParseError Int
108 read_either_int s =
109 maybeToEither (ParseMismatch child s "integer") (readMaybe s)
110
111 elements = parse xmltree
112 parse_results = map read_either_int elements
113
114
115
116 -- | Extract the \"XML_File_ID\" element from a document. If we fail
117 -- to parse an XML_File_ID, we return an appropriate 'ParseError'
118 -- wrapped in a 'Left' constructor. The reason should be one of two
119 -- things:
120 --
121 -- 1. No XML_File_ID elements were found.
122 --
123 -- 2. An XML_File_ID element was found, but it could not be read
124 -- into an Int.
125 --
126 -- In general we expect some non-integer XML_File_IDs, because they
127 -- appear on the feed. But the htsn daemon refuses to save them at
128 -- the moment, so if we ever see an XML_File_ID that we can't parse,
129 -- it's truly an error.
130 --
131 parse_xmlfid :: XmlTree -> Either ParseError Int
132 parse_xmlfid = parse_message_int "XML_File_ID"
133
134
135
136 -- | Extract the \<game_id\> element from within the top-level
137 -- \<message\> of a document. These appear in the "TSN.XML.GameInfo"
138 -- documents. Unlike the \<schedule_id\> and \<XML_File_ID\>
139 -- elements, the \<game_id\> can be missing from GameInfo
140 -- documents. So even the 'Right' value of the 'Either' can be
141 -- \"missing\". There are two reasons that the parse might fail.
142 --
143 -- 1. No such elements were found. This is expected sometimes, and
144 -- should be returned as a 'Right' 'Nothing'.
145 --
146 -- 2. An element was found, but it could not be read into an
147 -- 'Int'. This is NOT expected, and will be returned as a
148 -- 'ParseError', wrapped in a 'Left'.
149 --
150 -- Most of implementation for this ('parse_message_int') is shared,
151 -- but to handle the fact that game_id is optional, we pattern match
152 -- on the 'ParseError' that comes back in case of failure. If we
153 -- didn't find any game_id elements, we turn that into a
154 -- \"successful nothing\". But if we find a game_id and it can't be
155 -- parsed, we let the error propagate, because that shouldn't
156 -- happen. Of course, if the parse worked, that's nice too: we wrap
157 -- the parsed value in a 'Just' and return that wrapped in a 'Right'
158 --
159 parse_game_id :: XmlTree -> Either ParseError (Maybe Int)
160 parse_game_id xml =
161 case (parse_message_int "game_id" xml) of
162 Left (ParseNotFound _) -> Right Nothing
163 Left pm@(ParseMismatch {}) -> Left pm
164 Right whatever -> Right (Just whatever)
165
166
167
168 -- | Extract the \<schedule_id\> element from within the top-level
169 -- \<message\> of a document. These appear in the
170 -- "TSN.XML.GameInfo" documents. If we fail to parse a schedule_id,
171 -- we return the reason wrapped in an appropriate 'ParseError'. The reason
172 -- should be one of two things:
173 --
174 -- 1. No such elements were found.
175 --
176 -- 2. An element was found, but it could not be read
177 -- into an Int.
178 --
179 -- Both of these are truly errors in the case of schedule_id. The
180 -- implementation for this ('parse_message_int') is shared among a
181 -- few functions.
182 --
183 parse_schedule_id :: XmlTree -> Either ParseError Int
184 parse_schedule_id = parse_message_int "schedule_id"
185
186
187
188 -- | The format string for times appearing in the feed.
189 --
190 time_format :: String
191 time_format = "%I:%M %p"
192
193
194
195 -- | The format string for a time_stamp. We keep the leading/trailing
196 -- space so that parseTime and formatTime are inverses are one
197 -- another, even though there is some confusion as to how these two
198 -- functions should behave:
199 --
200 -- <https://ghc.haskell.org/trac/ghc/ticket/9150>
201 --
202 time_stamp_format :: String
203 time_stamp_format = " %B %-d, %Y, at " ++ time_format ++ " ET "
204
205
206
207 -- | Parse a time stamp from a 'String' (maybe). TSN doesn't provide a
208 -- proper time zone name, so we parse it as UTC, and maybe our
209 -- eventual consumer can figure out a way to deduce the time zone.
210 --
211 parse_time_stamp :: String -> Maybe UTCTime
212 parse_time_stamp =
213 parseTime defaultTimeLocale time_stamp_format
214
215
216
217 -- | Extract the \"time_stamp\" element from a document. If we fail to
218 -- parse a time_stamp, we return an appropriate 'ParseError'. The
219 -- reason should be one of two things:
220 --
221 -- 1. No time_Stamp elements were found.
222 --
223 -- 2. A time_stamp element was found, but it could not be read
224 -- into a UTCTime.
225 --
226 -- We don't expect to run into any time_stamps that we can't parse,
227 -- and they can never be missing, so both conditions are truly
228 -- errors.
229 --
230 parse_xml_time_stamp :: XmlTree -> Either ParseError UTCTime
231 parse_xml_time_stamp xmltree =
232 case parse_results of
233 [] -> Left $ ParseNotFound "time_stamp"
234 (x:_) -> x
235 where
236 parse :: XmlTree -> [String]
237 parse = runLA $ hasName "/"
238 /> hasName "message"
239 /> hasName "time_stamp"
240 >>> getChildren
241 >>> getText
242
243 read_either_utctime :: String -> Either ParseError UTCTime
244 read_either_utctime s =
245 maybeToEither (ParseMismatch "time_stamp" s "date/time")
246 (parse_time_stamp s)
247
248 elements = parse xmltree
249 parse_results = map read_either_utctime elements
250
251
252
253 --
254 -- * Tests
255 --
256
257 -- | A list of all tests for this module.
258 --
259 parse_tests :: TestTree
260 parse_tests =
261 testGroup
262 "TSN.Parse tests"
263 [ test_parse_game_id,
264 test_parse_missing_game_id,
265 test_parse_schedule_id,
266 test_parse_xmlfid ]
267 where
268 sample_path :: String
269 sample_path = "test/xml/gameinfo/CBASK_Lineup_XML.xml"
270
271 desc :: String -> String
272 desc child = "a known " ++ child ++ " is parsed correctly"
273
274
275 -- | Actual implementation of the test for parse_xmlfid,
276 -- parse_game_id, and parse_schedule_id.
277 --
278 test_child :: String -> Int -> TestTree
279 test_child child expected =
280 testCase (desc child) $ do
281 xmltree <- unsafe_read_document sample_path
282 let actual = parse_message_int child xmltree
283 actual @?= (Right expected)
284
285
286 -- | Make sure we can parse a game_id into the expected value.
287 --
288 test_parse_game_id :: TestTree
289 test_parse_game_id = test_child "game_id" 97865
290
291
292 -- | Make sure we can parse a schedule_id (different from the
293 -- game_id) into the expected value.
294 --
295 test_parse_schedule_id :: TestTree
296 test_parse_schedule_id = test_child "schedule_id" 10199
297
298
299 -- | Make sure we can parse an XML_File_ID into the expected value.
300 --
301 test_parse_xmlfid :: TestTree
302 test_parse_xmlfid = test_child "XML_File_ID" 17
303
304
305
306 -- | The game_id element can be missing, so we test that too.
307 --
308 test_parse_missing_game_id :: TestTree
309 test_parse_missing_game_id =
310 testCase "missing game_id is not an error" $ do
311 xmltree <- unsafe_read_document "test/xml/gameinfo/MLB_Matchup_XML.xml"
312 let actual = parse_game_id xmltree
313 let expected = Right Nothing
314 actual @?= expected