]> gitweb.michael.orlitzky.com - dead/htsn-import.git/blob - src/Main.hs
Rewrite everything to use XmlImport/DbImport classes making things much more easy...
[dead/htsn-import.git] / src / Main.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2 {-# LANGUAGE NoMonomorphismRestriction #-}
3 module Main
4 where
5
6 import Control.Arrow ( (&&&), (>>^), arr, returnA )
7 import Control.Concurrent ( threadDelay )
8 import Control.Exception ( SomeException, catch )
9 import Control.Monad ( when )
10 import Database.Groundhog.Generic ( runDbConn )
11 import Database.Groundhog.Sqlite (
12 withSqliteConn )
13 import Database.Groundhog.Postgresql (
14 withPostgresqlConn )
15 import Data.Monoid ( (<>) )
16 import Network.Services.TSN.Logging ( init_logging )
17 import System.Console.CmdArgs ( def )
18 import System.Directory ( removeFile )
19 import System.Exit ( exitWith, ExitCode (ExitFailure) )
20 import System.IO.Error ( catchIOError )
21 import Text.XML.HXT.Core (
22 ArrowXml,
23 IOStateArrow,
24 XmlTree,
25 (>>>),
26 (/>),
27 getAttrl,
28 getText,
29 hasName,
30 readDocument,
31 runX,
32 unpickleDoc,
33 xpickle)
34
35 import Backend ( Backend(..) )
36 import CommandLine ( get_args )
37 import Configuration ( Configuration(..), merge_optional )
38 import ConnectionString ( ConnectionString(..) )
39 import ExitCodes ( exit_no_xml_files )
40 import qualified OptionalConfiguration as OC (
41 OptionalConfiguration ( xml_files ),
42 from_rc )
43 import Network.Services.TSN.Report (
44 report_info,
45 report_error )
46 import TSN.DbImport ( DbImport(..), ImportResult(..) )
47 import qualified TSN.XML.Heartbeat as Heartbeat ( verify )
48 import qualified TSN.XML.Injuries as Injuries ( Message )
49 import qualified TSN.XML.InjuriesDetail as InjuriesDetail ( Message )
50 import qualified TSN.XML.News as News ( Message )
51 --import qualified TSN.XML.Odds as Odds ( Odds )
52 import Xml ( DtdName(..), parse_opts )
53
54
55 -- | This is where most of the work happens. This function is called
56 -- on every file that we would like to import. It determines which
57 -- importer to use based on the DTD, processes the file, and then
58 -- returns whether or not any records were imported. If the file was
59 -- processed, the number of records imported is returned (wrapped in
60 -- a Just). Otherwise, if the file was not processed, 'Nothing' is
61 -- returned.
62 --
63 -- Since we are already in arrow world with HXT, the
64 -- 'import_with_dtd' function is lifted to an 'Arrow' as well with
65 -- 'arr'. This prevents us from having to do a bunch of unwrapping
66 -- and rewrapping with the associated error checking.
67 --
68 import_file :: Configuration -- ^ A configuration object needed for the
69 -- 'backend' and 'connection_string'.
70
71 -> FilePath -- ^ The path of the XML file to import.
72
73 -> IO Bool -- ^ True if we processed the file, False otherwise.
74 import_file cfg path = do
75 results <- parse_and_import `catch` exception_handler
76 case results of
77 [] -> do
78 -- One of the arrows returned "nothing."
79 report_error $ "Unable to determine DTD for file " ++ path ++ "."
80 return False
81 (ImportFailed errmsg:_) -> do
82 report_error errmsg
83 return False
84 (ImportSkipped infomsg:_) -> do
85 -- We processed the message but didn't import anything. Return
86 -- "success" so that the XML file is deleted.
87 report_info infomsg
88 return True
89 (ImportSucceeded:_) -> do
90 report_info $ "Successfully imported " ++ path ++ "."
91 return True
92 (ImportUnsupported infomsg:_) -> do
93 -- For now we return "success" for these too, since we know we don't
94 -- support a bunch of DTDs and we want them to get deleted.
95 report_info infomsg
96 return True
97 where
98 -- | This will catch *any* exception, even the ones thrown by
99 -- Haskell's 'error' (which should never occur under normal
100 -- circumstances).
101 exception_handler :: SomeException -> IO [ImportResult]
102 exception_handler e = do
103 report_error (show e)
104 let errdesc = "Failed to import file " ++ path ++ "."
105 -- Return a nonempty list so we don't claim incorrectly that
106 -- we couldn't parse the DTD.
107 return [ImportFailed errdesc]
108
109 -- | An arrow that reads a document into an 'XmlTree'.
110 readA :: IOStateArrow s a XmlTree
111 readA = readDocument parse_opts path
112
113 -- | An arrow which parses the doctype "SYSTEM" of an 'XmlTree'.
114 -- We use these to determine the parser to use.
115 dtdnameA :: ArrowXml a => a XmlTree DtdName
116 dtdnameA = getAttrl >>> hasName "doctype-SYSTEM" /> getText >>^ DtdName
117
118 -- | Combine the arrows above as well as the function below
119 -- (arrowized with 'arr') into an IO action that does everything
120 -- (parses and then runs the import on what was parsed).
121 --
122 -- The result of runX has type IO [IO ImportResult]. We thus use
123 -- bind (>>=) and sequence to combine all of the IOs into one
124 -- big one outside of the list.
125 parse_and_import :: IO [ImportResult]
126 parse_and_import =
127 runX (readA >>> (dtdnameA &&& returnA) >>> (arr import_with_dtd))
128 >>=
129 sequence
130
131 -- | Takes a ('DtdName', 'XmlTree') pair and uses the 'DtdName'
132 -- to determine which function to call on the 'XmlTree'.
133 import_with_dtd :: (DtdName, XmlTree) -> IO ImportResult
134 import_with_dtd (DtdName dtd,xml)
135 -- We special-case the heartbeat so it doesn't have to run in
136 -- the database monad.
137 | dtd == "Heartbeat.dtd" = Heartbeat.verify xml
138 | otherwise = do
139 -- We need NoMonomorphismRestriction here.
140 if backend cfg == Postgres
141 then withPostgresqlConn cs $ runDbConn $ importer
142 else withSqliteConn cs $ runDbConn $ importer
143 where
144 -- | Pull the real connection String out of the configuration.
145 cs :: String
146 cs = get_connection_string $ connection_string cfg
147
148 -- | Convenience; we use this everywhere below in 'importer'.
149 migrate_and_import m = dbmigrate m >> dbimport m
150
151 importer
152 | dtd == "injuriesxml.dtd" = do
153 let m = unpickleDoc xpickle xml :: Maybe Injuries.Message
154 let errmsg = "Could not unpickle injuriesxml."
155 maybe (return $ ImportFailed errmsg) migrate_and_import m
156
157 | dtd == "Injuries_Detail_XML.dtd" = do
158 let m = unpickleDoc xpickle xml :: Maybe InjuriesDetail.Message
159 let errmsg = "Could not unpickle Injuries_Detail_XML."
160 maybe (return $ ImportFailed errmsg) migrate_and_import m
161
162
163 | dtd == "newsxml.dtd" = do
164 let m = unpickleDoc xpickle xml :: Maybe News.Message
165 let errmsg = "Could not unpickle newsxml."
166 maybe (return $ ImportFailed errmsg) migrate_and_import m
167
168 -- | dtd == "Odds_XML.dtd" = undefined
169
170 | otherwise = do
171 let infomsg =
172 "Unrecognized DTD in " ++ path ++ ": " ++ dtd ++ "."
173 return $ ImportUnsupported infomsg
174
175
176 -- | Entry point of the program. It twiddles some knobs for
177 -- configuration options and then calls 'import_file' on each XML file
178 -- given on the command-line.
179 --
180 -- Any file successfully processed is then removed, and we're done.
181 --
182 main :: IO ()
183 main = do
184 rc_cfg <- OC.from_rc
185 cmd_cfg <- get_args
186
187 -- Merge the config file options with the command-line ones,
188 -- prefering the command-line ones.
189 let opt_config = rc_cfg <> cmd_cfg
190
191 -- Update a default config with any options that have been set in
192 -- either the config file or on the command-line. We initialize
193 -- logging before the missing parameter checks below so that we can
194 -- log the errors.
195 let cfg = (def :: Configuration) `merge_optional` opt_config
196 init_logging (log_level cfg) (log_file cfg) (syslog cfg)
197
198 -- Check the optional config for missing required options.
199 when (null $ OC.xml_files opt_config) $ do
200 report_error "No XML files given."
201 exitWith (ExitFailure exit_no_xml_files)
202
203 -- We don't do this in parallel (for now?) to keep the error
204 -- messages nice and linear.
205 results <- mapM (import_file cfg) (OC.xml_files opt_config)
206
207 -- Zip the results with the files list to find out which ones can be
208 -- deleted.
209 let result_pairs = zip (OC.xml_files opt_config) results
210 let victims = [ p | (p, True) <- result_pairs ]
211 let imported_count = length victims
212 report_info $ "Imported " ++ (show imported_count) ++ " document(s) total."
213 mapM_ (kill True) victims
214
215 where
216 -- | Wrap these two actions into one function so that we don't
217 -- report that the file was removed if the exception handler is
218 -- run.
219 remove_and_report path = do
220 removeFile path
221 report_info $ "Removed processed file " ++ path ++ "."
222
223 -- | Try to remove @path@ and potentially try again.
224 kill try_again path =
225 (remove_and_report path) `catchIOError` exception_handler
226 where
227 -- | A wrapper around threadDelay which takes seconds instead of
228 -- microseconds as its argument.
229 thread_sleep :: Int -> IO ()
230 thread_sleep seconds = do
231 let microseconds = seconds * (10 ^ (6 :: Int))
232 threadDelay microseconds
233
234 -- | If we can't remove the file, report that, and try once
235 -- more after waiting a few seconds.
236 exception_handler :: IOError -> IO ()
237 exception_handler e = do
238 report_error (show e)
239 report_error $ "Failed to remove imported file " ++ path ++ "."
240 if try_again then do
241 report_info "Waiting 5 seconds to attempt removal again..."
242 thread_sleep 5
243 kill False path
244 else
245 report_info $ "Giving up on " ++ path ++ "."