#!/usr/bin/python3
r"""
Convert tinydns and dnscache logs to human-readable form
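
For example (the log file paths here are only illustrative):

    djbdns-logparse.py /var/log/dnscache/current
    djbdns-logparse.py < /var/log/tinydns/current

Input lines are fed through tai64nlocal, so they are expected to begin
with tai64n timestamps.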
"""

import re, typing
from struct import pack

## Regular expressions for matching tinydns/dnscache log lines. We
## compile these once here rather than within the corresponding
## matching functions, because the latter get executed repeatedly.

# This first pattern is used to match the timestamp format that the
# tai64nlocal program produces. It appears in both dnscache and
# tinydns lines, after they've been piped through tai64nlocal, of
# course.
timestamp_pat = r'[\d-]+ [\d:\.]+'

# The regex to match dnscache log lines.
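# It captures the timestamp, the event name, and the rest of the
# line (whose format depends on the event).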
dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)')

# The "hex4" pattern matches a string of four hexadecimal digits. This
# is used, for example, by tinydns to encode the query type
# identifier.
hex4_pat = r'[0-9a-f]{4}'

# The IP pattern matches a string of either 8 or 32 hexadecimal
# characters, which correspond to IPv4 and IPv6 addresses,
# respectively, in tinydns logs.
ip_pat = r'[0-9a-f]{8,32}'

# The regex to match tinydns log lines.
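# It captures, in order: the timestamp, the client IP, the client
# port, the query id, the response code, the query type, and the
# query name.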
tinydns_log_re = re.compile(
    rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)'
)

# A dictionary mapping query type identifiers, in decimal, to their
# friendly names for tinydns. Reference:
#
#   https://en.wikipedia.org/wiki/List_of_DNS_record_types
#
# Note that the mapping here is non-exhaustive, and that tinydns will
# log responses for record types that it does not know about.
query_type = {
    1: "a",
    2: "ns",
    5: "cname",
    6: "soa",
    12: "ptr",
    13: "hinfo",
    15: "mx",
    16: "txt",
    17: "rp",
    24: "sig",
    25: "key",
    28: "aaaa",
    33: "srv",
    35: "naptr",
    38: "a6",
    48: "dnskey",
    52: "tlsa",
    65: "https",
    252: "axfr",
    255: "any",
    257: "caa"
}

# tinydns can drop a query for one of three reasons; this dictionary
# maps the symbol that gets logged in each case to a human-readable
# reason. We include the "+" case here, indicating that the query was
# NOT dropped, to avoid a special case later on when we're formatting
# the human-readable output.
query_drop_reason = {
    "+": None,
    "-": "no authority",
    "I": "invalid query",
    "C": "invalid class",
    "/": "couldn't parse"
}


def convert_ip(ip : str) -> str:
    r"""
    Convert a hex string representing an IP address to
    human-readable form.

    Parameters
    ----------

    ip : str
        The hexadecimal representation of either an IPv4 or an IPv6
        address.

    Returns
    -------

    The usual decimal dotted-quad representation is returned for an
    IPv4 address. IPv6 addresses are returned almost as-is, but with
    colons inserted in the appropriate places, between every four
    characters.

    Examples
    --------

    >>> convert_ip("7f000001")
    '127.0.0.1'
    >>> convert_ip("00000000000000000000ffff7f000001")
    '0000:0000:0000:0000:0000:ffff:7f00:0001'
    """
    if len(ip) == 8:
        # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1"
        return ".".join(map(str, pack(">L", int(ip, 16))))
    elif len(ip) == 32:
        # IPv6 is actually simpler -- it's just a string-slicing operation.
        return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)])


def decode_client(words : list, i : int):
    r"""
    Helper function to decode the client field in a dnscache log
    entry.

    There are two possible formats for the client field,
      1. clientip:clientport, used by tcpopen/tcpclose entries,
      2. clientip:clientport:id, used by "query" entries.

    We convert each part from hex to decimal, and in the second
    format, separate the packet id from the client information.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the client field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["foo", "bar", "7f000001:9253", "quux"]
    >>> decode_client(words, 2)
    >>> words
    ['foo', 'bar', '127.0.0.1:37459', 'quux']

    >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"]
    >>> decode_client(words, 1)
    >>> words
    ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux']

    """
    chunks = words[i].split(":")

    ip = convert_ip(chunks[0])
    port = int(chunks[1], 16)
    words[i] = f"{ip}:{port}"

    if len(chunks) == 3:
        # For a "query" entry's clientip:clientport:id field.
        id = int(chunks[2], 16)
        words[i] += f" (id {id})"

def decode_ip(words : list, i : int):
    r"""
    Helper function to decode the ip field in a dnscache log
    entry.

    A single "serverip" field is present in the lame, nodata,
    nxdomain, and rr entry types. We convert it from hex to decimal.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the ip field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["foo", "bar", "7f000001", "quux"]
    >>> decode_ip(words, 2)
    >>> words
    ['foo', 'bar', '127.0.0.1', 'quux']

    >>> words = ["foo", "00000000000000000000ffff7f000001", "bar", "quux"]
    >>> decode_ip(words, 1)
    >>> words
    ['foo', '0000:0000:0000:0000:0000:ffff:7f00:0001', 'bar', 'quux']
    """
    words[i] = convert_ip(words[i])

def decode_ttl(words : list, i : int):
    r"""
    Helper function to decode the ttl field in a dnscache log
    entry.

    A single "ttl" field is present in the nodata, nxdomain, and
    rr entry types. We prefix it with "TTL=" so that its meaning
    is clear in the human-readable logs.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the ttl field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["c0a80101", "20865", "1", "www.example.com.", "5db8d822"]
    >>> decode_ttl(words, 1)
    >>> words
    ['c0a80101', 'TTL=20865', '1', 'www.example.com.', '5db8d822']

    """
    words[i] = f"TTL={words[i]}"

def decode_serial(words : list, i : int):
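    r"""
    Prefix the ``i``th entry in ``words`` (a decimal serial number)
    with "#" so that it stands out in the human-readable output.
    """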
    serial = int(words[i])
    words[i] = f"#{serial}"

def decode_type(words : list, i : int):
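    r"""
    Replace the ``i``th entry in ``words`` (a numeric query type
    identifier) with its friendly name from ``query_type``, leaving
    it alone if the type is not in the dictionary.
    """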
    qt = words[i]
    words[i] = query_type.get(int(qt), qt)

def handle_dnscache_log(line : str) -> typing.Optional[str]:
    r"""
    Handle a single log line if it matches the ``dnscache_log_re`` regex.

    Parameters
    ----------

    line : string
        The log line that might match ``dnscache_log_re``.

    Returns
    -------

    Either the human-readable string if the log line was handled (that
    is, if it was really a dnscache log line), or ``None`` if it was
    not.

    Examples
    --------

    >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com."
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.'

    >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . c0a80101"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1'

    >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 93.184.216.34'

    >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0'

    >>> line = "2022-09-15 18:37:33.878602500 sent 1 49"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878602500 sent #1 49'

    >>> line = "this line is nonsense"
    >>> handle_dnscache_log(line)

    """
    match = dnscache_log_re.match(line)
    if not match:
        return None

    (timestamp, event, data) = match.groups()

    words = data.split()
    if event == "cached":
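        # The "cached" type field may already be one of the literal
        # strings "cname", "ns", or "nxdomain"; only a numeric type
        # needs to be decoded.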
        if words[0] not in ("cname", "ns", "nxdomain"):
            decode_type(words, 0)

    elif event == "drop":
        decode_serial(words, 0)

    elif event == "lame":
        decode_ip(words, 0)

    elif event == "nodata":
        decode_ip(words, 0)
        decode_ttl(words, 1)
        decode_type(words, 2)

    elif event == "nxdomain":
        decode_ip(words, 0)
        decode_ttl(words, 1)

    elif event == "query":
        decode_serial(words, 0)
        decode_client(words, 1)
        decode_type(words, 2)

    elif event == "rr":
        decode_ip(words, 0)
        decode_ttl(words, 1)
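        # As with "cached" entries, the type field may be a literal
        # name rather than a number; only decode the numeric case.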
        if words[2] not in ("cname", "mx", "ns", "ptr", "soa"):
            decode_type(words, 2)
        if words[2] == "a":  # decode answer to an A query
            decode_ip(words, 4)
        if words[2] == "txt":  # text record
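            # The TXT response data is hex-encoded: the first byte is
            # the text length, and each remaining byte is one character
            # of the text. A record that was truncated in the log ends
            # with a literal "...", which we strip before decoding and
            # re-append to the decoded text.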
            response = words[4]
            if response.endswith("..."):
                ellipsis = "..."
                response = response[0:-3]
            else:
                ellipsis = ""
            length = int(response[0:2], 16)
            chars = []
            for i in range(1, len(response)//2):
                chars.append(chr(int(response[2*i : (2*i)+2], 16)))
            txt = "".join(chars)
            words[4] = f"{length}:\"{txt}{ellipsis}\""

    elif event == "sent":
        decode_serial(words, 0)

    elif event == "stats":
        words[0] = f"count={words[0]}"
        words[1] = f"motion={words[1]}"
        words[2] = f"udp-active={words[2]}"
        words[3] = f"tcp-active={words[3]}"

    elif event == "tx":
        words[0] = f"g={words[0]}"
        decode_type(words, 1)
        # words[2] = name
        # words[3] = control (domain for which these servers are believed
        #            to be authoritative)
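        # words[4:] = the IP addresses of those servers; decode each one.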
        for i in range(4, len(words)):
            decode_ip(words, i)

    elif event in ("tcpopen", "tcpclose"):
        decode_client(words, 0)

    # Reconstitute "data" (i.e. everything after the timestamp and the
    # event) from "words", which was originally obtained by splitting
    # "data".
    data = " ".join(words)
    return f"{timestamp} {event} {data}"


def handle_tinydns_log(line : str) -> typing.Optional[str]:
    r"""
    Handle a single log line if it matches the ``tinydns_log_re`` regex.

    Parameters
    ----------

    line : string
        The log line that might match ``tinydns_log_re``.

    Returns
    -------

    Either the human-readable string if the log line was handled (that
    is, if it was really a tinydns log line), or ``None`` if it was
    not.

    Examples
    --------

    >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com"
    >>> handle_tinydns_log(line)
    '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com'

    >>> line = "this line is nonsense"
    >>> handle_tinydns_log(line)

    """
    match = tinydns_log_re.match(line)
    if not match:
        return None

    (timestamp, ip, port, id, code, type, name) = match.groups()
    ip = convert_ip(ip)
    port = int(port, 16)
    id = int(id, 16)

    # Convert the "type" field to a human-readable record type name
    # using the query_type dictionary. If the right name isn't present
    # in the dictionary, we use the (decimal) type id instead.
    type = int(type, 16)              # "001c" -> 28
    type = query_type.get(type, type) # 28 -> "aaaa"

    line_tpl = "{timestamp} "

    reason = query_drop_reason[code]
    if code == "+":
        line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}"
    else:
        line_tpl += "dropped query ({reason}) from {ip}:{port}"
        if code != "/":
            # If the query can actually be parsed, the log line is a
            # bit more informative than it would have been otherwise.
            line_tpl += " (id {id}): {type} {name}"

    return line_tpl.format(timestamp=timestamp,
                           reason=reason,
                           ip=ip,
                           port=port,
                           id=id,
                           type=type,
                           name=name)


def parse_logfile(file : typing.TextIO):
    r"""
    Process a single log ``file``.

    Parameters
    ----------

    file : typing.TextIO
        An open log file, or stdin.

    Examples
    --------

    >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n"
    >>> from tempfile import NamedTemporaryFile
    >>> with NamedTemporaryFile(mode="w", delete=False) as f:
    ...     _ = f.write(line)
    >>> f = open(f.name, 'r')
    >>> parse_logfile(f)
    2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
    >>> f.close()
    >>> from os import remove
    >>> remove(f.name)

    """
    # Open a pipe to tai64nlocal. We'll write lines of our input file
    # (the log file) to it, and read back the same lines but with
    # friendly timestamps in them.
    from subprocess import Popen, PIPE
    tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0)

    for line in file:
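        # Write one line of the original log to tai64nlocal, and read
        # back the corresponding line with its timestamp converted.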
        tai.stdin.write(line)
        line = tai.stdout.readline()

        friendly_line = handle_tinydns_log(line)
        if not friendly_line:
            friendly_line = handle_dnscache_log(line)
        if not friendly_line:
            friendly_line = line

        print(friendly_line)

def main():
    r"""
    The entry point to the program.

    This function is responsible only for parsing any command-line
    arguments, and then calling :func:`parse_logfile` on them.
    """
    # Create an argument parser using the file's docstring as its
    # description.
    from argparse import ArgumentParser, FileType
    parser = ArgumentParser(description = __doc__)

    # Parse zero or more positional arguments into a list of
    # "logfiles". If none are given, read from stdin instead.
    from sys import stdin
    parser.add_argument("logfiles",
                        metavar="LOGFILE",
                        type=FileType("r"),
                        nargs="*",
                        default=[stdin],
                        help="djbdns logfile to process (default: stdin)")

    # Warning: argparse automatically opens its file arguments here,
    # and they only get closed when the program terminates. There's no
    # real benefit to closing them one-at-a-time after calling
    # parse_logfile(), because the "scarce" resource of open file
    # descriptors gets consumed immediately, before any processing has
    # happened. In other words, if you're going to run out of file
    # descriptors, it's going to happen right now.
    #
    # So anyway, don't run this on several million logfiles.
    args = parser.parse_args()
    for f in args.logfiles:
        parse_logfile(f)


if __name__ == "__main__":
    main()