#!/usr/bin/python3
r"""
Convert tinydns and dnscache logs to human-readable form
"""

import re, typing
from struct import pack

## Regular expressions for matching tinydns/dnscache log lines. We
## compile these once here rather than within the corresponding
## matching functions, because the latter get executed repeatedly.

# This first pattern is used to match the timestamp format that the
# tai64nlocal program produces. It appears in both dnscache and
# tinydns lines, after they've been piped through tai64nlocal, of
# course.
timestamp_pat = r'[\d-]+ [\d:\.]+'

# The regex to match dnscache log lines.
dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)')
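#
# For example, after being piped through tai64nlocal, a dnscache line
# looks like the following; the three captured groups are the
# timestamp, the event name, and everything else:
#
#   2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com.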

# The "hex4" pattern matches a string of four hexadecimal digits. This
# is used, for example, by tinydns to encode the query type
# identifier.
hex4_pat = r'[0-9a-f]{4}'

# The IP pattern matches a string of between 8 and 32 hexadecimal
# characters. In tinydns logs, IPv4 addresses are logged as 8 hex
# characters and IPv6 addresses as 32.
ip_pat = r'[0-9a-f]{8,32}'

# The regex to match tinydns log lines.
tinydns_log_re = re.compile(
    rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)'
)
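#
# For example, after tai64nlocal, a tinydns line looks like the
# following; the captured groups are the timestamp, client IP, client
# port, query id, response/drop code, query type, and query name:
#
#   2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com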

# A dictionary mapping query type identifiers, in decimal, to their
# friendly names for tinydns. Reference:
#
#   https://en.wikipedia.org/wiki/List_of_DNS_record_types
#
# Note that the mapping here is non-exhaustive, and that tinydns will
# log responses for record types that it does not know about.
query_type = {
    1: "a",
    2: "ns",
    5: "cname",
    6: "soa",
    12: "ptr",
    13: "hinfo",
    15: "mx",
    16: "txt",
    17: "rp",
    24: "sig",
    25: "key",
    28: "aaaa",
    33: "srv",
    35: "naptr",
    38: "a6",
    48: "dnskey",
    52: "tlsa",
    65: "https",
    252: "axfr",
    255: "any",
    257: "caa"
}
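
# A type id that is missing from the table above is simply left alone
# by the query_type.get(t, t) lookups below (see decode_type() and
# handle_tinydns_log()), so unknown types show up in the output as
# their decimal ids.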

# tinydns can drop a query for one of three reasons; this dictionary
# maps the symbol that gets logged in each case to a human-readable
# reason. We include the "+" case here, indicating that the query was
# NOT dropped, to avoid a special case later on when we're formatting
# the human-readable output.
query_drop_reason = {
    "+": None,
    "-": "no authority",
    "I": "invalid query",
    "C": "invalid class",
    "/": "couldn't parse"
}


def convert_ip(ip : str) -> str:
    r"""
    Convert a hex string representing an IP address to
    human-readable form.

    Parameters
    ----------

    ip : str
        The hexadecimal representation of either an IPv4 or an IPv6
        address.

    Returns
    -------

    The usual decimal dotted-quad representation is returned for an
    IPv4 address. IPv6 addresses are returned almost as-is, but with
    colons inserted in the appropriate places, between every four
    characters.

    Examples
    --------

    >>> convert_ip("7f000001")
    '127.0.0.1'
    >>> convert_ip("00000000000000000000ffff7f000001")
    '0000:0000:0000:0000:0000:ffff:7f00:0001'
    """
    if len(ip) == 8:
        # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1"
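        # pack(">L", ...) produces the four big-endian bytes of the
        # address (b"\x7f\x00\x00\x01" here); iterating over a bytes
        # object yields ints, which str() then renders in decimal.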
        return ".".join(map(str, pack(">L", int(ip, 16))))
    elif len(ip) == 32:
        # IPv6 is actually simpler -- it's just a string-slicing operation.
        return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)])


def decode_client(words : list, i : int):
    r"""
    Helper function to decode the client field in a dnscache log
    entry.

    There are two possible formats for the client field,

    1. clientip:clientport, used by tcpopen/tcpclose entries,
    2. clientip:clientport:id, used by "query" entries.

    We convert each part from hex to decimal, and in the second
    format, separate the packet id from the client information.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the client field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["foo", "bar", "7f000001:9253", "quux"]
    >>> decode_client(words, 2)
    >>> words
    ['foo', 'bar', '127.0.0.1:37459', 'quux']

    >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"]
    >>> decode_client(words, 1)
    >>> words
    ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux']

    """
    chunks = words[i].split(":")

    ip = convert_ip(chunks[0])
    port = int(chunks[1], 16)
    words[i] = f"{ip}:{port}"

    if len(chunks) == 3:
        # For a "query" entry's clientip:clientport:id field.
        id = int(chunks[2], 16)
        words[i] += f" (id {id})"

def decode_ip(words : list, i : int):
    r"""
    Helper function to decode the ip field in a dnscache log
    entry.

    A single "serverip" field is present in the lame, nodata,
    nxdomain, and rr entry types. We convert it from hex to
    human-readable form with :func:`convert_ip`.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the ip field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["foo", "bar", "7f000001", "quux"]
    >>> decode_ip(words, 2)
    >>> words
    ['foo', 'bar', '127.0.0.1', 'quux']

    >>> words = ["foo", "00000000000000000000ffff7f000001", "bar", "quux"]
    >>> decode_ip(words, 1)
    >>> words
    ['foo', '0000:0000:0000:0000:0000:ffff:7f00:0001', 'bar', 'quux']
    """
    words[i] = convert_ip(words[i])
def decode_ttl(words : list, i : int):
    r"""
    Helper function to decode the ttl field in a dnscache log
    entry.

    A single "ttl" field is present in the nodata, nxdomain, and
    rr entry types. We prefix it with "TTL=" so that its meaning
    is clear in the human-readable logs.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the ttl field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["c0a80101", "20865", "1", "www.example.com.", "5db8d822"]
    >>> decode_ttl(words, 1)
    >>> words
    ['c0a80101', 'TTL=20865', '1', 'www.example.com.', '5db8d822']

    """
    words[i] = f"TTL={words[i]}"

def decode_serial(words : list, i : int):
    r"""
    Helper function to decode the serial field in a dnscache log
    entry.

    A single "serial" field is present in the drop and query entry
    types. It's already in decimal; we simply prefix it with a hash.

    Parameters
    ----------

    words : list
        The ``words`` list (a list of fields) from
        :func:`handle_dnscache_log`.

    i : int
        The index of the serial field within ``words``

    Returns
    -------

    Nothing; the ``i``th entry in the ``words`` list is modified
    in-place.

    Examples
    --------

    >>> words = ["1", "7f000001:a3db:4fb9", "1", "www.example.com."]
    >>> decode_serial(words, 0)
    >>> words
    ['#1', '7f000001:a3db:4fb9', '1', 'www.example.com.']

    """
    words[i] = f"#{words[i]}"

def decode_type(words : list, i : int):
    r"""
    Helper function to decode the query type field in a dnscache log
    entry: the decimal type id in the ``i``th entry of ``words`` is
    replaced, in-place, by its friendly name from ``query_type``
    (unknown ids are left alone).
    """
    qt = words[i]
    words[i] = query_type.get(int(qt), qt)

def handle_dnscache_log(line : str) -> typing.Optional[str]:
    r"""
    Handle a single log line if it matches the ``dnscache_log_re`` regex.

    Parameters
    ----------

    line : string
        The log line that might match ``dnscache_log_re``.

    Returns
    -------

    Either the human-readable string if the log line was handled (that
    is, if it was really a dnscache log line), or ``None`` if it was
    not.

    Examples
    --------

    >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com."
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.'

    >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . c0a80101"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1'

    >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 93.184.216.34'

    >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0'

    >>> line = "2022-09-15 18:37:33.878602500 sent 1 49"
    >>> handle_dnscache_log(line)
    '2022-09-15 18:37:33.878602500 sent #1 49'

    >>> line = "this line is nonsense"
    >>> handle_dnscache_log(line)

    """
    match = dnscache_log_re.match(line)
    if not match:
        return None

    (timestamp, event, data) = match.groups()

    words = data.split()
    if event == "cached":
        if words[0] not in ("cname", "ns", "nxdomain"):
            decode_type(words, 0)

    elif event == "drop":
        decode_serial(words, 0)

    elif event == "lame":
        decode_ip(words, 0)

    elif event == "nodata":
        decode_ip(words, 0)
        decode_ttl(words, 1)
        decode_type(words, 2)

    elif event == "nxdomain":
        decode_ip(words, 0)
        decode_ttl(words, 1)

    elif event == "query":
        decode_serial(words, 0)
        decode_client(words, 1)
        decode_type(words, 2)

    elif event == "rr":
        decode_ip(words, 0)
        decode_ttl(words, 1)
        if words[2] not in ("cname", "mx", "ns", "ptr", "soa"):
            decode_type(words, 2)
        if words[2] == "a": # decode answer to an A query
            decode_ip(words, 4)
        if words[2] == "txt": # text record
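            # The record data is hex-encoded: the code below interprets
            # the first byte as the length of the text and the
            # remaining bytes as its characters. For example, a
            # (hypothetical) response of "0b68656c6c6f20776f726c64"
            # would decode to 11:"hello world". A truncated response
            # ends in "...", which we preserve.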
            response = words[4]
            if response.endswith("..."):
                ellipsis = "..."
                response = response[0:-3]
            else:
                ellipsis = ""
            length = int(response[0:2], 16)
            chars = []
            for i in range(1, len(response)//2):
                chars.append(chr(int(response[2*i : (2*i)+2], 16)))
            txt = "".join(chars)
            words[4] = f"{length}:\"{txt}{ellipsis}\""

    elif event == "sent":
        decode_serial(words, 0)

    elif event == "stats":
        words[0] = f"count={words[0]}"
        words[1] = f"motion={words[1]}"
        words[2] = f"udp-active={words[2]}"
        words[3] = f"tcp-active={words[3]}"

    elif event == "tx":
        words[0] = f"g={words[0]}"
        decode_type(words, 1)
        # words[2] = name
        # words[3] = control (domain for which these servers are believed
        # to be authoritative)
        for i in range(4, len(words)):
            decode_ip(words, i)

    elif event in ("tcpopen", "tcpclose"):
        decode_client(words, 0)

    # Reconstitute "data" (i.e. everything after the timestamp and the
    # event) from "words", which was originally obtained by splitting
    # "data".
    data = " ".join(words)
    return f"{timestamp} {event} {data}"


def handle_tinydns_log(line : str) -> typing.Optional[str]:
    r"""
    Handle a single log line if it matches the ``tinydns_log_re`` regex.

    Parameters
    ----------

    line : string
        The log line that might match ``tinydns_log_re``.

    Returns
    -------

    Either the human-readable string if the log line was handled (that
    is, if it was really a tinydns log line), or ``None`` if it was
    not.

    Examples
    --------

    >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com"
    >>> handle_tinydns_log(line)
    '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com'

    >>> line = "this line is nonsense"
    >>> handle_tinydns_log(line)

    """
    match = tinydns_log_re.match(line)
    if not match:
        return None

    (timestamp, ip, port, id, code, type, name) = match.groups()
    ip = convert_ip(ip)
    port = int(port, 16)
    id = int(id, 16)

    # Convert the "type" field to a human-readable record type name
    # using the query_type dictionary. If the right name isn't present
    # in the dictionary, we use the (decimal) type id instead.
    type = int(type, 16)              # "001c" -> 28
    type = query_type.get(type, type) # 28 -> "aaaa"

    line_tpl = "{timestamp} "

    reason = query_drop_reason[code]
    if code == "+":
        line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}"
    else:
        line_tpl += "dropped query ({reason}) from {ip}:{port}"
        if code != "/":
            # If the query can actually be parsed, the log line is a
            # bit more informative than it would have been otherwise.
            line_tpl += " (id {id}): {type} {name}"

    return line_tpl.format(timestamp=timestamp,
                           reason=reason,
                           ip=ip,
                           port=port,
                           id=id,
                           type=type,
                           name=name)


def parse_logfile(file : typing.TextIO):
    r"""
    Process a single log ``file``.

    Parameters
    ----------

    file : typing.TextIO
        An open log file, or stdin.

    Examples
    --------

    >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n"
    >>> from tempfile import NamedTemporaryFile
    >>> with NamedTemporaryFile(mode="w", delete=False) as f:
    ...     _ = f.write(line)
    >>> f = open(f.name, 'r')
    >>> parse_logfile(f)
    2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
    >>> f.close()
    >>> from os import remove
    >>> remove(f.name)

    """
    # Open a pipe to tai64nlocal. We'll write lines of our input file
    # (the log file) to it, and read back the same lines but with
    # friendly timestamps in them.
    from subprocess import Popen, PIPE
    tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0)
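    #
    # Note: this assumes the tai64nlocal binary (part of DJB's
    # daemontools) is on the PATH. bufsize=0 keeps the pipe unbuffered,
    # so each line written below can be read back immediately.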

    for line in file:
        tai.stdin.write(line)
        line = tai.stdout.readline()

        friendly_line = handle_tinydns_log(line)
        if not friendly_line:
            friendly_line = handle_dnscache_log(line)
        if not friendly_line:
            friendly_line = line

        print(friendly_line)

def main():
    r"""
    The entry point to the program.

    This function is responsible only for parsing any command-line
    arguments, and then calling :func:`parse_logfile` on them.
    """
    # Create an argument parser using the file's docstring as its
    # description.
    from argparse import ArgumentParser, FileType
    parser = ArgumentParser(description = __doc__)

    # Parse zero or more positional arguments into a list of
    # "logfiles". If none are given, read from stdin instead.
    from sys import stdin
    parser.add_argument("logfiles",
                        metavar="LOGFILE",
                        type=FileType("r"),
                        nargs="*",
                        default=[stdin],
                        help="djbdns logfile to process (default: stdin)")

    # Warning: argparse automatically opens its file arguments here,
    # and they only get closed when the program terminates. There's no
    # real benefit to closing them one-at-a-time after calling
    # parse_logfile(), because the "scarce" resource of open file
    # descriptors gets consumed immediately, before any processing has
    # happened. In other words, if you're going to run out of file
    # descriptors, it's going to happen right now.
    #
    # So anyway, don't run this on several million logfiles.
    args = parser.parse_args()
    for f in args.logfiles:
        parse_logfile(f)


if __name__ == "__main__":
    main()