From: Michael Orlitzky Date: Fri, 16 Sep 2022 23:16:57 +0000 (-0400) Subject: */*: refactor into a driver program and a library. X-Git-Tag: 0.0.1~26 X-Git-Url: http://gitweb.michael.orlitzky.com/?p=djbdns-logparse.git;a=commitdiff_plain;h=3611c026366d615fb51e31e62cdedb7d68293c3e */*: refactor into a driver program and a library. Now there are a few separate modules in a "djbdns" package, and the djbdns-logparse executable that makes use of them contains only what the main() function used to. --- diff --git a/bin/djbdns-logparse b/bin/djbdns-logparse new file mode 100755 index 0000000..7808fbd --- /dev/null +++ b/bin/djbdns-logparse @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +r""" +Convert tinydns and dnscache logs to human-readable form +""" +from argparse import ArgumentParser, FileType +from djbdns.io import parse_logfile + +# Create an argument parser using the file's docstring as its +# description. +parser = ArgumentParser(description = __doc__) + +# Parse zero or more positional arguments into a list of +# "logfiles". If none are given, read from stdin instead. +from sys import stdin +parser.add_argument("logfiles", +                    metavar="LOGFILE", +                    type=FileType("r"), +                    nargs="*", +                    default=[stdin], +                    help="djbdns logfile to process (default: stdin)") + +# Warning: argparse automatically opens its file arguments here, +# and they only get closed when the program terminates. There's no +# real benefit to closing them one-at-a-time after calling +# parse_logfile(), because the "scarce" resource of open file +# descriptors gets consumed immediately, before any processing has +# happened. In other words, if you're going to run out of file +# descriptors, it's going to happen right now. +# +# So anyway, don't run this on several million logfiles. 
+args = parser.parse_args() +for f in args.logfiles: + parse_logfile(f) diff --git a/bin/djbdns-logparse.py b/bin/djbdns-logparse.py deleted file mode 100755 index 1df2bf8..0000000 --- a/bin/djbdns-logparse.py +++ /dev/null @@ -1,555 +0,0 @@ -#!/usr/bin/python3 -r""" -Convert tinydns and dnscache logs to human-readable form -""" - -import re, typing -from struct import pack - -## Regular expressions for matching tinydns/dnscache log lines. We -## compile these once here rather than within the corresponding -## matching functions, because the latter get executed repeatedly. - -# This first pattern is used to match the timestamp format that the -# tai64nlocal program produces. It appears in both dnscache and -# tinydns lines, after they've been piped through tai64nlocal, of -# course. -timestamp_pat = r'[\d-]+ [\d:\.]+' - -# The regex to match dnscache log lines. -dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)') - -# The "hex4" pattern matches a string of four hexadecimal digits. This -# is used, for example, by tinydns to encode the query type -# identifier. -hex4_pat = r'[0-9a-f]{4}' - -# The IP pattern matches a string of either 8 or 32 hexadecimal -# characters, which correspond to IPv4 and IPv6 addresses, -# respectively, in tinydns logs. -ip_pat = r'[0-9a-f]{8,32}' - -# The regex to match tinydns log lines. -tinydns_log_re = re.compile( - rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)' -) - -# A dictionary mapping query type identifiers, in decimal, to their -# friendly names for tinydns. Reference: -# -# https://en.wikipedia.org/wiki/List_of_DNS_record_types -# -# Note that mapping here is non-exhaustive, and that tinydns will -# log responses for record types that it does not know about. 
-query_type = { - 1: "a", - 2: "ns", - 5: "cname", - 6: "soa", - 12: "ptr", - 13: "hinfo", - 15: "mx", - 16: "txt", - 17: "rp", - 24: "sig", - 25: "key", - 28: "aaaa", - 33: "srv", - 35: "naptr", - 38: "a6", - 48: "dnskey", - 52: "tlsa", - 65: "https", - 252: "axfr", - 255: "any", - 257: "caa" -} - -# tinydns can drop a query for one of three reasons; this dictionary -# maps the symbol that gets logged in each case to a human-readable -# reason. We include the "+" case here, indicating that the query was -# NOT dropped, to avoid a special case later on when we're formatting -# the human-readable output. -query_drop_reason = { - "+": None, - "-": "no authority", - "I": "invalid query", - "C": "invalid class", - "/": "couldn't parse" -} - - -def convert_ip(ip : str) -> str: - r""" - Convert a hex string representing an IP address to - human-readable form. - - Parameters - ---------- - - ip : str - The hexadecimal representation of either an IPv4 or an IPv6 - address. - - Returns - ------- - - The usual decimal dotted-quad representation is returned for an - IPv4 address. IPv6 addresses are returned almost as-is, but with - colons inserted in the appropriate places, between every four - characters. - - Examples - -------- - - >>> convert_ip("7f000001") - '127.0.0.1' - >>> convert_ip("00000000000000000000ffff7f000001") - '0000:0000:0000:0000:0000:ffff:7f00:0001' - """ - if len(ip) == 8: - # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1" - return ".".join(map(str, pack(">L", int(ip, 16)))) - elif len(ip) == 32: - # IPv6 is actually simpler -- it's just a string-slicing operation. - return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)]) - - -def decode_client(words : list, i : int): - r""" - Helper function to decode the client field in a dnscache log - entry. - - There are two possible formats for the client field, - - 1. clientip:clientport, used by tcpopen/tcpclose entries, - 2. clientip:clientport:id, used by "query" entries. 
- - We convert each part from hex to decimal, and in the second - format, separate the packet id from the client information. - - Parameters - ---------- - - words : list - The ``words`` list (a list of fields) from - :func:`handle_dnscache_log`. - - i : int - The index of the client field within ``words`` - - Returns - ------- - - Nothing; the ``i``th entry in the ``words`` list is modified - in-place. - - Examples - -------- - - >>> words = ["foo", "bar", "7f000001:9253", "quux"] - >>> decode_client(words, 2) - >>> words - ['foo', 'bar', '127.0.0.1:37459', 'quux'] - - >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"] - >>> decode_client(words, 1) - >>> words - ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux'] - - """ - chunks = words[i].split(":") - - ip = convert_ip(chunks[0]) - port = int(chunks[1], 16) - words[i] = f"{ip}:{port}" - - if len(chunks) == 3: - # For a "query" entry's clientip:clientport:id field. - id = int(chunks[2], 16) - words[i] += f" (id {id})" - -def decode_ip(words : list, i : int): - r""" - Helper function to decode the ip field in a dnscache log - entry. - - A single "serverip" field is present in the lame, nodata, - nxdomain, and rr entry types. We convert it from hex to decimal. - - Parameters - ---------- - - words : list - The ``words`` list (a list of fields) from - :func:`handle_dnscache_log`. - - i : int - The index of the ip field within ``words`` - - Returns - ------- - - Nothing; the ``i``th entry in the ``words`` list is modified - in-place. 
- - Examples - -------- - - >>> words = ["foo", "bar", "7f000001", "quux"] - >>> decode_ip(words, 2) - >>> words - ['foo', 'bar', '127.0.0.1', 'quux'] - - >>> words = ["foo", "00000000000000000000ffff7f000001", "bar", "quux"] - >>> decode_ip(words, 1) - >>> words - ['foo', '0000:0000:0000:0000:0000:ffff:7f00:0001', 'bar', 'quux'] - """ - words[i] = convert_ip(words[i]) - -def decode_ttl(words : list, i : int): - r""" - Helper function to decode the ttl field in a dnscache log - entry. - - A single "ttl" field is present in the nodata, nxdomain, and - rr entry types. We prefix it with "TTL=" so that its meaning - is clear in the human-readable logs. - - Parameters - ---------- - - words : list - The ``words`` list (a list of fields) from - :func:`handle_dnscache_log`. - - i : int - The index of the ttl field within ``words`` - - Returns - ------- - - Nothing; the ``i``th entry in the ``words`` list is modified - in-place. - - Examples - -------- - - >>> words = ["c0a80101", "20865", "1", "www.example.com.", "5db8d822"] - >>> decode_ttl(words, 1) - >>> words - ['c0a80101', 'TTL=20865', '1', 'www.example.com.', '5db8d822'] - - """ - words[i] = f"TTL={words[i]}" - -def decode_serial(words : list, i : int): - r""" - Helper function to decode the serial field in a dnscache log - entry. - - A single "serial" field is present in the drop and query entry - types. It's already in decimal; we simply prefix it with a hash. - - Parameters - ---------- - - words : list - The ``words`` list (a list of fields) from - :func:`handle_dnscache_log`. - - i : int - The index of the serial field within ``words`` - - Returns - ------- - - Nothing; the ``i``th entry in the ``words`` list is modified - in-place. 
- - Examples - -------- - - >>> words = ["1", "7f000001:a3db:4fb9", "1", "www.example.com."] - >>> decode_serial(words, 0) - >>> words - ['#1', '7f000001:a3db:4fb9', '1', 'www.example.com.'] - - """ - words[i] = f"#{words[i]}" - -def decode_type(words : list, i : int): - qt = words[i] - words[i] = query_type.get(int(qt), qt) - -def handle_dnscache_log(line : str) -> typing.Optional[str]: - r""" - Handle a single log line if it matches the ``dnscache_log_re`` regex. - - Parameters - ---------- - - line : string - The log line that might match ``dnscache_log_re``. - - Returns - ------- - - Either the human-readable string if the log line was handled (that - is, if it was really a dnscache log line), or ``None`` if it was - not. - - Examples - -------- - - >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com." - >>> handle_dnscache_log(line) - '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.' - - >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . c0a80101" - >>> handle_dnscache_log(line) - '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1' - - >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822" - >>> handle_dnscache_log(line) - '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 
93.184.216.34' - - >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0" - >>> handle_dnscache_log(line) - '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0' - - >>> line = "2022-09-15 18:37:33.878602500 sent 1 49" - >>> handle_dnscache_log(line) - '2022-09-15 18:37:33.878602500 sent #1 49' - - >>> line = "this line is nonsense" - >>> handle_dnscache_log(line) - - """ - match = dnscache_log_re.match(line) - if not match: - return None - - (timestamp, event, data) = match.groups() - - words = data.split() - if event == "cached": - if words[0] not in ("cname", "ns", "nxdomain"): - decode_type(words, 0) - - elif event == "drop": - decode_serial(words, 0) - - elif event == "lame": - decode_ip(words, 0) - - elif event == "nodata": - decode_ip(words, 0) - decode_ttl(words, 1) - decode_type(words, 2) - - elif event == "nxdomain": - decode_ip(words, 0) - decode_ttl(words, 1) - - elif event == "query": - decode_serial(words, 0) - decode_client(words, 1) - decode_type(words, 2) - - elif event == "rr": - decode_ip(words, 0) - decode_ttl(words, 1) - if words[2] not in ("cname", "mx", "ns", "ptr", "soa"): - decode_type(words, 2) - if words[2] == "a": # decode answer to an A query - decode_ip(words, 4) - if words[2] == "txt": # text record - response = words[4] - if response.endswith("..."): - ellipsis = "..." 
- response = response[0:-3] - else: - ellipsis = "" - length = int(response[0:2], 16) - chars = [] - for i in range(1, len(response)//2): - chars.append(chr(int(response[2*i : (2*i)+2], 16))) - txt = "".join(chars) - words[4] = f"{length}:\"{txt}{ellipsis}\"" - - elif event == "sent": - decode_serial(words, 0) - - elif event == "stats": - words[0] = f"count={words[0]}" - words[1] = f"motion={words[1]}" - words[2] = f"udp-active={words[2]}" - words[3] = f"tcp-active={words[3]}" - - elif event == "tx": - words[0] = f"g={words[0]}" - decode_type(words, 1) - # words[2] = name - # words[3] = control (domain for which these servers are believed - # to be authoritative) - for i in range(4, len(words)): - decode_ip(words, i) - - elif event in ("tcpopen", "tcpclose"): - decode_client(words, 0) - - # Reconstitute "data" (i.e. everything after the timestamp and the - # event) from "words", which was originally obtained by splitting - # "data". - data = " ".join(words) - return f"{timestamp} {event} {data}" - - - -def handle_tinydns_log(line : str) -> typing.Optional[str]: - r""" - Handle a single log line if it matches the ``tinydns_log_re`` regex. - - Parameters - ---------- - - line : string - The log line that might match ``tinydns_log_re``. - - Returns - ------- - - Either the human-readable string if the log line was handled (that - is, if it was really a tinydns log line), or ``None`` if it was - not. 
- - Examples - -------- - - >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com" - >>> handle_tinydns_log(line) - '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com' - - >>> line = "this line is nonsense" - >>> handle_tinydns_log(line) - - """ - match = tinydns_log_re.match(line) - if not match: - return None - - (timestamp, ip, port, id, code, type, name) = match.groups() - ip = convert_ip(ip) - port = int(port, 16) - id = int(id, 16) - - # Convert the "type" field to a human-readable record type name - # using the query_type dictionary. If the right name isn't present - # in the dictionary, we use the (decimal) type id instead. - type = int(type, 16) # "001c" -> 28 - type = query_type.get(type, type) # 28 -> "aaaa" - - line_tpl = "{timestamp} " - - reason = query_drop_reason[code] - if code == "+": - line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}" - else: - line_tpl += "dropped query ({reason}) from {ip}:{port}" - if code != "/": - # If the query can actually be parsed, the log line is a - # bit more informative than it would have been otherwise. - line_tpl += " (id {id}): {type} {name}" - - return line_tpl.format(timestamp=timestamp, - reason=reason, - ip=ip, - port=port, - id=id, - type=type, - name=name) - - -def parse_logfile(file : typing.TextIO): - r""" - Process a single log ``file``. - - Parameters - ---------- - - file : typing.TextIO - An open log file, or stdin. - - Examples - -------- - - >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n" - >>> from tempfile import NamedTemporaryFile - >>> with NamedTemporaryFile(mode="w", delete=False) as f: - ... 
_ = f.write(line) - >>> f = open(f.name, 'r') - >>> parse_logfile(f) - 2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com - >>> f.close() - >>> from os import remove - >>> remove(f.name) - - """ - # Open a pipe to tai64nlocal. We'll write lines of our input file - # (the log file) to it, and read back the same lines but with - # friendly timestamps in them. - from subprocess import Popen, PIPE - tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0) - - for line in file: - tai.stdin.write(line) - line = tai.stdout.readline() - - friendly_line = handle_tinydns_log(line) - if not friendly_line: - friendly_line = handle_dnscache_log(line) - if not friendly_line: - friendly_line = line - - print(friendly_line) - -def main(): - r""" - The entry point to the program. - - This function is responsible only for parsing any command-line - arguments, and then calling :func`parse_logfile` on them. - """ - # Create an argument parser using the file's docsctring as its - # description. - from argparse import ArgumentParser, FileType - parser = ArgumentParser(description = __doc__) - - # Parse zero or more positional arguments into a list of - # "logfiles". If none are given, read from stdin instead. - from sys import stdin - parser.add_argument("logfiles", - metavar="LOGFILE", - type=FileType("r"), - nargs="*", - default=[stdin], - help="djbdns logfile to process (default: stdin)") - - # Warning: argparse automatically opens its file arguments here, - # and they only get closed when the program terminates. There's no - # real benefit to closing them one-at-a-time after calling - # parse_logfile(), because the "scarce" resource of open file - # descriptors gets consumed immediately, before any processing has - # happened. In other words, if you're going to run out of file - # descriptors, it's going to happen right now. - # - # So anyway, don't run this on several million logfiles. 
- args = parser.parse_args() - for f in args.logfiles: - parse_logfile(f) - - -if __name__ == "__main__": - main() diff --git a/setup.py b/setup.py index 622d2d4..4fe1afb 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,8 @@ setup( author_email = 'michael@orlitzky.com', url = 'http://michael.orlitzky.com/code/djbdns-logparse.xhtml', scripts = ['bin/djbdns-logparse'], + packages = ['djbdns'], + package_dir = {'djbdns': 'src/djbdns'}, description = 'Convert tinydns and dnscache logs to human-readable form', license = 'doc/LICENSE' ) diff --git a/src/djbdns/common.py b/src/djbdns/common.py new file mode 100644 index 0000000..6ef4e1c --- /dev/null +++ b/src/djbdns/common.py @@ -0,0 +1,72 @@ +from struct import pack + +# A pattern to match the timestamp format that the tai64nlocal program +# produces. It appears in both dnscache and tinydns lines, after +# they've been piped through tai64nlocal, of course. +timestamp_pat = r'[\d-]+ [\d:\.]+' + +# A dictionary mapping query type identifiers, in decimal, to their +# friendly names for tinydns. Reference: +# +# https://en.wikipedia.org/wiki/List_of_DNS_record_types +# +# Note that mapping here is non-exhaustive, and that tinydns will +# log responses for record types that it does not know about. +query_type = { + 1: "a", + 2: "ns", + 5: "cname", + 6: "soa", + 12: "ptr", + 13: "hinfo", + 15: "mx", + 16: "txt", + 17: "rp", + 24: "sig", + 25: "key", + 28: "aaaa", + 33: "srv", + 35: "naptr", + 38: "a6", + 48: "dnskey", + 52: "tlsa", + 65: "https", + 252: "axfr", + 255: "any", + 257: "caa" +} + +def convert_ip(ip : str) -> str: + r""" + Convert a hex string representing an IP address to + human-readable form. + + Parameters + ---------- + + ip : str + The hexadecimal representation of either an IPv4 or an IPv6 + address. + + Returns + ------- + + The usual decimal dotted-quad representation is returned for an + IPv4 address. 
IPv6 addresses are returned almost as-is, but with + colons inserted in the appropriate places, between every four + characters. + + Examples + -------- + + >>> convert_ip("7f000001") + '127.0.0.1' + >>> convert_ip("00000000000000000000ffff7f000001") + '0000:0000:0000:0000:0000:ffff:7f00:0001' + """ + if len(ip) == 8: + # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1" + return ".".join(map(str, pack(">L", int(ip, 16)))) + elif len(ip) == 32: + # IPv6 is actually simpler -- it's just a string-slicing operation. + return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)]) diff --git a/src/djbdns/dnscache.py b/src/djbdns/dnscache.py new file mode 100644 index 0000000..ff1e4da --- /dev/null +++ b/src/djbdns/dnscache.py @@ -0,0 +1,298 @@ +from re import compile +from typing import Optional +from djbdns.common import * + +# The regex to match dnscache log lines. +dnscache_log_re = compile(fr'({timestamp_pat}) (\w+)(.*)') + + +def decode_client(words : list, i : int): + r""" + Helper function to decode the client field in a dnscache log + entry. + + There are two possible formats for the client field, + + 1. clientip:clientport, used by tcpopen/tcpclose entries, + 2. clientip:clientport:id, used by "query" entries. + + We convert each part from hex to decimal, and in the second + format, separate the packet id from the client information. + + Parameters + ---------- + + words : list + The ``words`` list (a list of fields) from + :func:`handle_dnscache_log`. + + i : int + The index of the client field within ``words`` + + Returns + ------- + + Nothing; the ``i``th entry in the ``words`` list is modified + in-place. 
+ + Examples + -------- + + >>> words = ["foo", "bar", "7f000001:9253", "quux"] + >>> decode_client(words, 2) + >>> words + ['foo', 'bar', '127.0.0.1:37459', 'quux'] + + >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"] + >>> decode_client(words, 1) + >>> words + ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux'] + + """ + chunks = words[i].split(":") + + ip = convert_ip(chunks[0]) + port = int(chunks[1], 16) + words[i] = f"{ip}:{port}" + + if len(chunks) == 3: + # For a "query" entry's clientip:clientport:id field. + id = int(chunks[2], 16) + words[i] += f" (id {id})" + +def decode_ip(words : list, i : int): + r""" + Helper function to decode the ip field in a dnscache log + entry. + + A single "serverip" field is present in the lame, nodata, + nxdomain, and rr entry types. We convert it from hex to decimal. + + Parameters + ---------- + + words : list + The ``words`` list (a list of fields) from + :func:`handle_dnscache_log`. + + i : int + The index of the ip field within ``words`` + + Returns + ------- + + Nothing; the ``i``th entry in the ``words`` list is modified + in-place. + + Examples + -------- + + >>> words = ["foo", "bar", "7f000001", "quux"] + >>> decode_ip(words, 2) + >>> words + ['foo', 'bar', '127.0.0.1', 'quux'] + + >>> words = ["foo", "00000000000000000000ffff7f000001", "bar", "quux"] + >>> decode_ip(words, 1) + >>> words + ['foo', '0000:0000:0000:0000:0000:ffff:7f00:0001', 'bar', 'quux'] + """ + words[i] = convert_ip(words[i]) + +def decode_ttl(words : list, i : int): + r""" + Helper function to decode the ttl field in a dnscache log + entry. + + A single "ttl" field is present in the nodata, nxdomain, and + rr entry types. We prefix it with "TTL=" so that its meaning + is clear in the human-readable logs. + + Parameters + ---------- + + words : list + The ``words`` list (a list of fields) from + :func:`handle_dnscache_log`. 
+ + i : int + The index of the ttl field within ``words`` + + Returns + ------- + + Nothing; the ``i``th entry in the ``words`` list is modified + in-place. + + Examples + -------- + + >>> words = ["c0a80101", "20865", "1", "www.example.com.", "5db8d822"] + >>> decode_ttl(words, 1) + >>> words + ['c0a80101', 'TTL=20865', '1', 'www.example.com.', '5db8d822'] + + """ + words[i] = f"TTL={words[i]}" + +def decode_serial(words : list, i : int): + r""" + Helper function to decode the serial field in a dnscache log + entry. + + A single "serial" field is present in the drop and query entry + types. It's already in decimal; we simply prefix it with a hash. + + Parameters + ---------- + + words : list + The ``words`` list (a list of fields) from + :func:`handle_dnscache_log`. + + i : int + The index of the serial field within ``words`` + + Returns + ------- + + Nothing; the ``i``th entry in the ``words`` list is modified + in-place. + + Examples + -------- + + >>> words = ["1", "7f000001:a3db:4fb9", "1", "www.example.com."] + >>> decode_serial(words, 0) + >>> words + ['#1', '7f000001:a3db:4fb9', '1', 'www.example.com.'] + + """ + words[i] = f"#{words[i]}" + +def decode_type(words : list, i : int): + qt = words[i] + words[i] = query_type.get(int(qt), qt) + +def handle_dnscache_log(line : str) -> Optional[str]: + r""" + Handle a single log line if it matches the ``dnscache_log_re`` regex. + + Parameters + ---------- + + line : string + The log line that might match ``dnscache_log_re``. + + Returns + ------- + + Either the human-readable string if the log line was handled (that + is, if it was really a dnscache log line), or ``None`` if it was + not. + + Examples + -------- + + >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com." + >>> handle_dnscache_log(line) + '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.' + + >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . 
c0a80101" + >>> handle_dnscache_log(line) + '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1' + + >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822" + >>> handle_dnscache_log(line) + '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 93.184.216.34' + + >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0" + >>> handle_dnscache_log(line) + '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0' + + >>> line = "2022-09-15 18:37:33.878602500 sent 1 49" + >>> handle_dnscache_log(line) + '2022-09-15 18:37:33.878602500 sent #1 49' + + >>> line = "this line is nonsense" + >>> handle_dnscache_log(line) + + """ + match = dnscache_log_re.match(line) + if not match: + return None + + (timestamp, event, data) = match.groups() + + words = data.split() + if event == "cached": + if words[0] not in ("cname", "ns", "nxdomain"): + decode_type(words, 0) + + elif event == "drop": + decode_serial(words, 0) + + elif event == "lame": + decode_ip(words, 0) + + elif event == "nodata": + decode_ip(words, 0) + decode_ttl(words, 1) + decode_type(words, 2) + + elif event == "nxdomain": + decode_ip(words, 0) + decode_ttl(words, 1) + + elif event == "query": + decode_serial(words, 0) + decode_client(words, 1) + decode_type(words, 2) + + elif event == "rr": + decode_ip(words, 0) + decode_ttl(words, 1) + if words[2] not in ("cname", "mx", "ns", "ptr", "soa"): + decode_type(words, 2) + if words[2] == "a": # decode answer to an A query + decode_ip(words, 4) + if words[2] == "txt": # text record + response = words[4] + if response.endswith("..."): + ellipsis = "..." 
+ response = response[0:-3] + else: + ellipsis = "" + length = int(response[0:2], 16) + chars = [] + for i in range(1, len(response)//2): + chars.append(chr(int(response[2*i : (2*i)+2], 16))) + txt = "".join(chars) + words[4] = f"{length}:\"{txt}{ellipsis}\"" + + elif event == "sent": + decode_serial(words, 0) + + elif event == "stats": + words[0] = f"count={words[0]}" + words[1] = f"motion={words[1]}" + words[2] = f"udp-active={words[2]}" + words[3] = f"tcp-active={words[3]}" + + elif event == "tx": + words[0] = f"g={words[0]}" + decode_type(words, 1) + # words[2] = name + # words[3] = control (domain for which these servers are believed + # to be authoritative) + for i in range(4, len(words)): + decode_ip(words, i) + + elif event in ("tcpopen", "tcpclose"): + decode_client(words, 0) + + # Reconstitute "data" (i.e. everything after the timestamp and the + # event) from "words", which was originally obtained by splitting + # "data". + data = " ".join(words) + return f"{timestamp} {event} {data}" diff --git a/src/djbdns/io.py b/src/djbdns/io.py new file mode 100644 index 0000000..eee86cb --- /dev/null +++ b/src/djbdns/io.py @@ -0,0 +1,46 @@ +from subprocess import Popen, PIPE +from typing import TextIO +from djbdns.dnscache import handle_dnscache_log +from djbdns.tinydns import handle_tinydns_log + +def parse_logfile(file : TextIO): + r""" + Process a single log ``file``. + + Parameters + ---------- + + file : typing.TextIO + An open log file, or stdin. + + Examples + -------- + + >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n" + >>> from tempfile import NamedTemporaryFile + >>> with NamedTemporaryFile(mode="w", delete=False) as f: + ... _ = f.write(line) + >>> f = open(f.name, 'r') + >>> parse_logfile(f) + 2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com + >>> f.close() + >>> from os import remove + >>> remove(f.name) + + """ + # Open a pipe to tai64nlocal. 
We'll write lines of our input file + # (the log file) to it, and read back the same lines but with + # friendly timestamps in them. + tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0) + + for line in file: + tai.stdin.write(line) + line = tai.stdout.readline() + + friendly_line = handle_tinydns_log(line) + if not friendly_line: + friendly_line = handle_dnscache_log(line) + if not friendly_line: + friendly_line = line + + print(friendly_line) diff --git a/src/djbdns/tinydns.py b/src/djbdns/tinydns.py new file mode 100644 index 0000000..52b06a4 --- /dev/null +++ b/src/djbdns/tinydns.py @@ -0,0 +1,95 @@ +from re import compile +from typing import Optional +from djbdns.common import * + +# The "hex4" pattern matches a string of four hexadecimal digits. This +# is used, for example, by tinydns to encode the query type +# identifier. +hex4_pat = r'[0-9a-f]{4}' + +# The IP pattern matches a string of either 8 or 32 hexadecimal +# characters, which correspond to IPv4 and IPv6 addresses, +# respectively, in tinydns logs. +ip_pat = r'[0-9a-f]{8,32}' + +# The regex to match tinydns log lines. +tinydns_log_re = compile( + rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)' +) + +# tinydns can drop a query for one of three reasons; this dictionary +# maps the symbol that gets logged in each case to a human-readable +# reason. We include the "+" case here, indicating that the query was +# NOT dropped, to avoid a special case later on when we're formatting +# the human-readable output. +query_drop_reason = { + "+": None, + "-": "no authority", + "I": "invalid query", + "C": "invalid class", + "/": "couldn't parse" +} + + +def handle_tinydns_log(line : str) -> Optional[str]: + r""" + Handle a single log line if it matches the ``tinydns_log_re`` regex. + + Parameters + ---------- + + line : string + The log line that might match ``tinydns_log_re``. 
+ + Returns + ------- + + Either the human-readable string if the log line was handled (that + is, if it was really a tinydns log line), or ``None`` if it was + not. + + Examples + -------- + + >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com" + >>> handle_tinydns_log(line) + '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com' + + >>> line = "this line is nonsense" + >>> handle_tinydns_log(line) + + """ + match = tinydns_log_re.match(line) + if not match: + return None + + (timestamp, ip, port, id, code, type, name) = match.groups() + ip = convert_ip(ip) + port = int(port, 16) + id = int(id, 16) + + # Convert the "type" field to a human-readable record type name + # using the query_type dictionary. If the right name isn't present + # in the dictionary, we use the (decimal) type id instead. + type = int(type, 16) # "001c" -> 28 + type = query_type.get(type, type) # 28 -> "aaaa" + + line_tpl = "{timestamp} " + + reason = query_drop_reason[code] + if code == "+": + line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}" + else: + line_tpl += "dropped query ({reason}) from {ip}:{port}" + if code != "/": + # If the query can actually be parsed, the log line is a + # bit more informative than it would have been otherwise. + line_tpl += " (id {id}): {type} {name}" + + return line_tpl.format(timestamp=timestamp, + reason=reason, + ip=ip, + port=port, + id=id, + type=type, + name=name)