From: Michael Orlitzky Date: Fri, 25 Jul 2014 03:10:41 +0000 (-0400) Subject: Modify TSN.XML.EarlyLine so it can support MLB_earlylineXML.dtd as well. X-Git-Tag: 0.0.9~7 X-Git-Url: https://gitweb.michael.orlitzky.com/?a=commitdiff_plain;h=6674a915d50cc41e4c5b12e42f2fcbb65e2bf195;p=dead%2Fhtsn-import.git Modify TSN.XML.EarlyLine so it can support MLB_earlylineXML.dtd as well. --- diff --git a/src/TSN/XML/EarlyLine.hs b/src/TSN/XML/EarlyLine.hs index 4ad9ae2..5aa2254 100644 --- a/src/TSN/XML/EarlyLine.hs +++ b/src/TSN/XML/EarlyLine.hs @@ -5,12 +5,24 @@ {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} --- | Parse TSN XML for the DTD \"earlylineXML.dtd\". Each \ --- element contains a bunch of \s, and those \s --- contain a single \. In the database, we merge the date --- info into the games, and key the games to the messages. +-- | Parse TSN XML for the DTD \"earlylineXML.dtd\". For that DTD, +-- each \ element contains a bunch of \s, and those +-- \s contain a single \. In the database, we merge +-- the date info into the games, and key the games to the messages. +-- +-- Real life is not so simple, however. There is another module, +-- "TSN.XML.MLBEarlyLine" that is something of a subclass of this +-- one. It contains early lines, but only for MLB games. The data +-- types and XML schema are /almost/ the same, but TSN like to make +-- things difficult. +-- +-- A full list of the differences is given in that module. In this +-- one, we mention where data types have been twerked a little to +-- support the second document type. -- module TSN.XML.EarlyLine ( + EarlyLine, -- Used in TSN.XML.MLBEarlyLine + EarlyLineGame, -- Used in TSN.XML.MLBEarlyLine dtd, pickle_message, -- * Tests @@ -21,7 +33,7 @@ module TSN.XML.EarlyLine ( where -- System imports. -import Control.Monad ( forM_ ) +import Control.Monad ( join ) import Data.Time ( UTCTime(..) ) import Data.Tuple.Curry ( uncurryN ) import Database.Groundhog ( @@ -42,14 +54,15 @@ import Test.Tasty.HUnit ( (@?=), testCase ) import Text.XML.HXT.Core ( PU, xp4Tuple, + xp6Tuple, xp7Tuple, xpAttr, xpElem, xpInt, xpList, xpOption, + xpPair, xpText, - xpTriple, xpWrap ) -- Local imports. @@ -74,7 +87,7 @@ dtd :: String dtd = "earlylineXML.dtd" -- --- DB/XML data types +-- * DB/XML data types -- -- * EarlyLine/Message @@ -140,7 +153,29 @@ instance XmlImport Message --- * EarlyLineDate +-- * EarlyLineDate / EarlyLineGameWithNote + +-- | This is a very sad data type. It exists so that we can +-- successfully unpickle/pickle the MLB_earlylineXML.dtd documents +-- and get back what we started with. In that document type, the +-- dates all have multiple \s associated with them (as +-- children). But the dates also have multiple \s as +-- children, and we're supposed to figure out which notes go with +-- which games based on the order that they appear in the XML +-- file. Yeah, right. +-- +-- In any case, instead of expecting the games and notes in some +-- nice order, we use this data type to expect \"a game and maybe a +-- note\" multiple times. This will pair the notes with only one +-- game, rather than all of the games that TSN think it should go +-- with. But it allows us to pickle and unpickle correctly at least. +-- +data EarlyLineGameWithNote = + EarlyLineGameWithNote { + date_note :: Maybe String, + date_game :: EarlyLineGameXml } + deriving (Eq, Show) + -- | XML representation of a \. It has a \"value\" attribute -- containing the actual date string. As children it contains a @@ -150,65 +185,183 @@ instance XmlImport Message data EarlyLineDate = EarlyLineDate { date_value :: UTCTime, - date_note :: String, - date_game :: EarlyLineGameXml } + date_games_with_notes :: [EarlyLineGameWithNote] } deriving (Eq, Show) -- * EarlyLineGame / EarlyLineGameXml +-- | Database representation of a \ in earlylineXML.dtd and +-- MLB_earlylineXML.dtd. We've had to make a sacrifice here to +-- support both document types. Since it's not possible to pair the +-- \s with \s reliably in MLB_earlylineXML.dtd, we +-- have omitted the notes entirely. This is sad, but totally not our +-- fault. +-- +-- In earlylineXML.dtd, each \ and thus each \ is +-- paired with exactly one \, so if we only cared about that +-- document type, we could have retained the notes. +-- +-- In earlylinexml.DTD, the over/under is required, but in +-- MLB_earlylinexml.DTD it is not. So another compromise is to have +-- it optional here. +-- +-- The 'db_game_time' should be the combined date/time using the +-- date value from the \ element's containing +-- \. That's why EarlyLineGame isn't an instance of +-- 'FromXmlFk': the foreign key isn't enough to construct one, we +-- also need the date. +-- data EarlyLineGame = EarlyLineGame { db_early_lines_id :: DefaultKey EarlyLine, db_game_time :: UTCTime, -- ^ Combined date/time - db_note :: String, -- ^ Taken from the parent \ db_away_team :: EarlyLineGameTeam, db_home_team :: EarlyLineGameTeam, - db_over_under :: String } + db_over_under :: Maybe String } + +-- | XML representation of a 'EarlyLineGame'. Comparatively, it lacks +-- only the foreign key to the parent message. +-- data EarlyLineGameXml = EarlyLineGameXml { xml_game_time :: UTCTime, -- ^ Only an ambiguous time string, e.g. \"8:30\" - xml_away_team :: EarlyLineGameTeam, - xml_home_team :: EarlyLineGameTeam, - xml_over_under :: String } + xml_away_team :: EarlyLineGameTeamXml, + xml_home_team :: EarlyLineGameTeamXml, + xml_over_under :: Maybe String } deriving (Eq, Show) --- | XML representation of an earlyline team. It doubles as an + +-- * EarlyLineGameTeam / EarlyLineGameTeamXml + +-- | Database representation of an EarlyLine team, used in both +-- earlylineXML.dtd and MLB_earlylineXML.dtd. It doubles as an -- embedded type within the DB representation 'EarlyLineGame'. -- +-- The team name is /not/ optional. However, since we're overloading +-- the XML representation, we're constructing 'db_team_name' name +-- from two Maybes, 'xml_team_name_attr' and +-- 'xml_team_name_text'. To ensure type safety (and avoid a runtime +-- crash), we allow the database field to be optional as well. +-- data EarlyLineGameTeam = EarlyLineGameTeam { db_rotation_number :: Int, db_line :: Maybe String, -- ^ Can be blank, a Double, or \"off\". - db_team_name :: String } + db_team_name :: Maybe String, -- ^ NOT optional, see the data type docs. + db_pitcher :: Maybe String -- ^ Optional in MLB_earlylineXML.dtd, + -- always absent in earlylineXML.dtd. + } + + +-- | This here is an abomination. What we've got is an XML +-- representation, not for either earlylineXML.dtd or +-- MLB_earlylineXML.dtd, but one that will work for /both/. Even +-- though they represent the teams totally differently! Argh! +-- +-- The earlylineXML.dtd teams look like, +-- +-- \Miami\ +-- +-- While the MLB_earlylineXML.dtd teams look like, +-- +-- +-- D.Haren +-- -130 +-- +-- +-- So that's cool. This data type has placeholders that should allow +-- the name/line to appear either as an attribute or as a text +-- node. We'll sort it all out in the conversion to +-- EarlyLineGameTeam. +-- +data EarlyLineGameTeamXml = + EarlyLineGameTeamXml { + xml_rotation_number :: Int, + xml_line_attr :: Maybe String, + xml_team_name_attr :: Maybe String, + xml_team_name_text :: Maybe String, + xml_pitcher :: Maybe String, + xml_line_elem :: Maybe String } deriving (Eq, Show) --- | Convert an 'EarlyLineDate' into an 'EarlyLineGame'. Each date has --- exactly one game, and the fields that belong to the date should --- really be in the game anyway. So the database representation of a --- game has the combined fields of the XML date/game. + +instance ToDb EarlyLineGameTeamXml where + -- | The database analogue of a 'EarlyLineGameTeamXml' is an + -- 'EarlyLineGameTeam', although the DB type is merely embedded + -- in another type. + -- + type Db EarlyLineGameTeamXml = EarlyLineGameTeam + + +-- | The 'FromXml' instance for 'EarlyLineGameTeamXml' lets us convert +-- it to a 'EarlyLineGameTeam' easily. -- --- This function gets the game out of a date, and then sticks the --- date value and note inside the game. It also adds the foreign key --- reference to the game's parent message, and returns the result. +instance FromXml EarlyLineGameTeamXml where + -- | To convert a 'EarlyLineGameTeamXml' to an 'EarlyLineGameTeam', + -- we figure how its fields were represented and choose the ones + -- that are populated. For example if the \"line\" attribute was + -- there, we'll use it, but if now, we'll use the \ + -- element. + -- + from_xml EarlyLineGameTeamXml{..} = + EarlyLineGameTeam { + db_rotation_number = xml_rotation_number, + db_line = merge xml_line_attr xml_line_elem, + db_team_name = merge xml_team_name_attr xml_team_name_text, + db_pitcher = xml_pitcher } + where + merge :: Maybe String -> Maybe String -> Maybe String + merge Nothing y = y + merge x Nothing = x + merge _ _ = Nothing + + + + +-- | Convert an 'EarlyLineDate' into a list of 'EarlyLineGame's. Each +-- date has one or more games, and the fields that belong to the date +-- should really be in the game anyway. So the database +-- representation of a game has the combined fields of the XML +-- date/game. -- -date_to_game :: (DefaultKey EarlyLine) -> EarlyLineDate -> EarlyLineGame -date_to_game fk date = - EarlyLineGame { - db_early_lines_id = fk, - db_game_time = combined_date_time, - db_note = (date_note date), - db_away_team = xml_away_team (date_game date), - db_home_team = xml_home_team (date_game date), - db_over_under = xml_over_under (date_game date) } +-- This function gets the games out of a date, and then sticks the +-- date value inside the games. It also adds the foreign key +-- reference to the games' parent message, and returns the result. +-- +-- This would convert a single date to a single game if we only +-- needed to support earlylineXML.dtd and not MLB_earlylineXML.dtd. +-- +date_to_games :: (DefaultKey EarlyLine) -> EarlyLineDate -> [EarlyLineGame] +date_to_games fk date = + map convert_game games_only where - date_part = date_value date - time_part = xml_game_time (date_game date) - combined_date_time = UTCTime (utctDay date_part) (utctDayTime time_part) + -- | Get the list of games out of a date (i.e. drop the notes). + -- + games_only :: [EarlyLineGameXml] + games_only = (map date_game (date_games_with_notes date)) + + -- | Stick the date value into the given game. + -- + combine_date_time :: EarlyLineGameXml -> UTCTime + combine_date_time elgx = + UTCTime (utctDay $ date_value date) (utctDayTime $ xml_game_time elgx) + + -- | Convert an XML game to a database one. + -- + convert_game :: EarlyLineGameXml -> EarlyLineGame + convert_game gx = + EarlyLineGame { + db_early_lines_id = fk, + db_game_time = combine_date_time gx, + db_away_team = from_xml (xml_away_team gx), + db_home_team = from_xml (xml_home_team gx), + db_over_under = xml_over_under gx } + -- -- * Database stuff @@ -224,12 +377,16 @@ instance DbImport Message where -- Insert the message and obtain its ID. msg_id <- insert_xml m - -- Now loop through the message's s. - forM_ (xml_dates m) $ \date -> do - -- Each date only contains one game, so we convert the date to a - -- game and then insert the game (keyed to the message). - let game = date_to_game msg_id date - insert_ game + -- Create a function that will turn a list of dates into a list of + -- games by converting each date to its own list of games, and + -- then concatenating all of the game lists together. + let convert_dates_to_games = concatMap (date_to_games msg_id) + + -- Now use it to make dem games. + let games = convert_dates_to_games (xml_dates m) + + -- And insert all of them + mapM_ insert_ games return ImportSucceeded @@ -260,11 +417,13 @@ mkPersist tsn_codegen_config [groundhog| - {name: rotation_number, dbName: away_team_rotation_number} - {name: line, dbName: away_team_line} - {name: team_name, dbName: away_team_name} + - {name: pitcher, dbName: away_team_pitcher} - name: db_home_team embeddedType: - {name: rotation_number, dbName: home_team_rotation_number} - {name: line, dbName: home_team_line} - {name: team_name, dbName: home_team_name} + - {name: pitcher, dbName: home_team_pitcher} - embedded: EarlyLineGameTeam fields: @@ -274,7 +433,8 @@ mkPersist tsn_codegen_config [groundhog| dbName: line - name: db_team_name dbName: team_name - + - name: db_pitcher + dbName: pitcher |] @@ -308,22 +468,35 @@ pickle_message = xml_time_stamp m) + +-- | Pickler for a '\ followed by a \. We turn them into +-- a 'EarlyLineGameWithNote'. +-- +pickle_game_with_note :: PU EarlyLineGameWithNote +pickle_game_with_note = + xpWrap (from_tuple, to_tuple) $ + xpPair (xpOption $ xpElem "note" xpText) + pickle_game + where + from_tuple = uncurry EarlyLineGameWithNote + to_tuple m = (date_note m, date_game m) + + -- | Pickler for the \ elements within each \. -- pickle_date :: PU EarlyLineDate pickle_date = xpElem "date" $ xpWrap (from_tuple, to_tuple) $ - xpTriple (xpAttr "value" xp_early_line_date) - (xpElem "note" xpText) - pickle_game + xpPair (xpAttr "value" xp_early_line_date) + (xpList pickle_game_with_note) where - from_tuple = uncurryN EarlyLineDate - to_tuple m = (date_value m, date_note m, date_game m) + from_tuple = uncurry EarlyLineDate + to_tuple m = (date_value m, date_games_with_notes m) --- | Pickler for the \ element within each \. +-- | Pickler for the \ elements within each \. -- pickle_game :: PU EarlyLineGameXml pickle_game = @@ -332,7 +505,7 @@ pickle_game = xp4Tuple (xpElem "time" xp_ambiguous_time) pickle_away_team pickle_home_team - (xpElem "over_under" xpText) + (xpElem "over_under" (xpOption xpText)) where from_tuple = uncurryN EarlyLineGameXml to_tuple m = (xml_game_time m, @@ -346,7 +519,7 @@ pickle_game = -- of the work (common with the home team pickler) is done by -- 'pickle_team'. -- -pickle_away_team :: PU EarlyLineGameTeam +pickle_away_team :: PU EarlyLineGameTeamXml pickle_away_team = xpElem "teamA" pickle_team @@ -354,23 +527,39 @@ pickle_away_team = xpElem "teamA" pickle_team -- of the work (common with theaway team pickler) is done by -- 'pickle_team'. -- -pickle_home_team :: PU EarlyLineGameTeam +pickle_home_team :: PU EarlyLineGameTeamXml pickle_home_team = xpElem "teamH" pickle_team -- | Team pickling common to both 'pickle_away_team' and -- 'pickle_home_team'. Handles everything inside the \ and --- \ elements. +-- \ elements. We try to parse the line/name as both an +-- attribute and an element in order to accomodate +-- MLB_earlylineXML.dtd. +-- +-- The \"line\" and \"pitcher\" fields wind up being double-Maybes, +-- since they can be empty even if they exist. -- -pickle_team :: PU EarlyLineGameTeam +pickle_team :: PU EarlyLineGameTeamXml pickle_team = xpWrap (from_tuple, to_tuple) $ - xpTriple (xpAttr "rotation" xpInt) - (xpAttr "line" (xpOption xpText)) - xpText + xp6Tuple (xpAttr "rotation" xpInt) + (xpOption $ xpAttr "line" (xpOption xpText)) + (xpOption $ xpAttr "name" xpText) + (xpOption xpText) + (xpOption $ xpElem "pitcher" (xpOption xpText)) + (xpOption $ xpElem "line" (xpOption xpText)) where - from_tuple = uncurryN EarlyLineGameTeam - to_tuple m = (db_rotation_number m, db_line m, db_team_name m) + from_tuple (u,v,w,x,y,z) = + EarlyLineGameTeamXml u (join v) w x (join y) (join z) + + to_tuple (EarlyLineGameTeamXml u v w x y z) = + (u, double_just v, w, x, double_just y, double_just z) + where + double_just val = case val of + Nothing -> Nothing + just_something -> Just just_something +