From 32b88ddd2697216d75d11b0e90f3b94c1caf6282 Mon Sep 17 00:00:00 2001
From: Rahmadi Trimananda
Date: Wed, 25 Oct 2017 09:17:04 -0700
Subject: [PATCH] First version of scripts for traffic analysis

---
 CAPture.py             | 385 +++++++++++++++++++++++++++++++++++++++++
 extract_from_tshark.py | 175 +++++++++++++++++++
 2 files changed, 560 insertions(+)
 create mode 100644 CAPture.py
 create mode 100644 extract_from_tshark.py

diff --git a/CAPture.py b/CAPture.py
new file mode 100644
index 0000000..4d6972a
--- /dev/null
+++ b/CAPture.py
@@ -0,0 +1,385 @@
+#!/usr/local/bin/python2.7
+
+""" -----------------------------------------------------------------------------
+    CAPture - a pcap file analyzer and report generator
+    (c) 2017 - Rahmadi Trimananda
+    University of California, Irvine - Programming Language and Systems
+    -----------------------------------------------------------------------------
+    Credits to tutorial: https://dpkt.readthedocs.io/en/latest/
+    -----------------------------------------------------------------------------
+"""
+
+import datetime
+import dpkt
+from dpkt.compat import compat_ord
+
+import socket
+import sys
+
+""" -----------------------------------------------------------------------------
+    Global variable declarations
+    -----------------------------------------------------------------------------
+"""
+# Command line arguments
+INPUT = "-i"
+OUTPUT = "-o"
+POINT_TO_MANY = "-pm"
+VERBOSE = "-v"
+
+
+def mac_addr(address):
+    # Courtesy of: https://dpkt.readthedocs.io/en/latest/
+    """ Convert a MAC address to a readable/printable string
+        Args:
+            address (str): a MAC address in hex form (e.g. '\x01\x02\x03\x04\x05\x06')
+        Returns:
+            str: printable/readable MAC address
+    """
+    return ':'.join('%02x' % compat_ord(b) for b in address)
+
+
+def inet_to_str(inet):
+    # Courtesy of: https://dpkt.readthedocs.io/en/latest/
+    """ Convert inet object to a string
+        Args:
+            inet (inet struct): inet network address
+        Returns:
+            str: printable/readable IP address
+    """
+    # First try ipv4 and then ipv6
+    try:
+        return socket.inet_ntop(socket.AF_INET, inet)
+    except ValueError:
+        return socket.inet_ntop(socket.AF_INET6, inet)
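+
+
+# Illustrative sketch of what the two helpers above produce (the byte strings
+# are made-up sample inputs, shown doctest-style rather than executed):
+#
+#   mac_addr('\x01\x02\x03\x04\x05\x06')          -> '01:02:03:04:05:06'
+#   inet_to_str(socket.inet_aton('192.168.1.1'))  -> '192.168.1.1'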
+
+
+def show_usage():
+    """ Show usage of this Python script
+    """
+    print "Usage: python CAPture.py [ -i <input>.pcap ] [ -o <output file> ] [ -pm <device address> ] [ -v ]"
+    print
+    print "[ -i ]  = input PCAP file to analyze"
+    print "[ -o ]  = output file for the generated report"
+    print "[ -pm ] = point-to-many analysis for the given device (MAC or IP) address"
+    print "[ -v ]  = verbose output"
+    print "By default, this script does a simple statistical analysis of IP, TCP, and UDP packets."
+    print "(c) 2017 - University of California, Irvine - Programming Language and Systems"
+
+
+def show_progress(verbose, counter):
+    """ Show packet processing progress
+        Args:
+            verbose: verbose output (True/False)
+            counter: counter of all packets
+    """
+    if verbose:
+        print "Processing packet number: ", counter
+    else:
+        if counter % 100000 == 0:
+            print "Processing %s packets..." % counter
+
+
+def show_summary(counter, ip_counter, tcp_counter, udp_counter):
+    """ Show summary statistics of the PCAP file
+        Args:
+            counter: counter of all packets
+            ip_counter: counter of all IP packets
+            tcp_counter: counter of all TCP packets
+            udp_counter: counter of all UDP packets
+    """
+    print
+    print "Total number of packets in the pcap file: ", counter
+    print "Total number of ip packets: ", ip_counter
+    print "Total number of tcp packets: ", tcp_counter
+    print "Total number of udp packets: ", udp_counter
+    print
+
+
+def save_to_file(tbl_header, dictionary, filename_out):
+    """ Append a dictionary's (key, value) pairs to a file under a table header
+        Args:
+            tbl_header: header for the saved table
+            dictionary: dictionary to be saved
+            filename_out: name of the file to save to
+    """
+    # Appending, not overwriting!
+    f = open(filename_out, 'a')
+    # Write the table header
+    f.write("\n\n" + str(tbl_header) + "\n")
+    # Iterate over the dictionary and write (key, value) pairs
+    for key, value in dictionary.iteritems():
+        f.write(str(key) + ", " + str(value) + "\n")
+
+    f.close()
+    print "Writing output to file: ", filename_out
+
+
+def statistical_analysis(verbose, pcap, counter, ip_counter, tcp_counter, udp_counter):
+    """ This is the default analysis of packet statistics (generic)
+        Args:
+            verbose: verbose output (True/False)
+            pcap: object that handles PCAP file content
+            counter: counter of all packets
+            ip_counter: counter of all IP packets
+            tcp_counter: counter of all TCP packets
+            udp_counter: counter of all UDP packets
+    """
+    for time_stamp, packet in pcap:
+
+        counter += 1
+        eth = dpkt.ethernet.Ethernet(packet)
+
+        if verbose:
+            # Print out the timestamp in UTC
+            print "Timestamp: ", str(datetime.datetime.utcfromtimestamp(time_stamp))
+            # Print out the MAC addresses
+            print "Ethernet frame: ", mac_addr(eth.src), mac_addr(eth.dst), eth.data.__class__.__name__
+
+        # Process only IP data
+        if not isinstance(eth.data, dpkt.ip.IP):
+            is_ip = False
+            if verbose:
+                print "Non-IP packet type not analyzed... skipping..."
+        else:
+            is_ip = True
+
+        if is_ip:
+            ip = eth.data
+            ip_counter += 1
+
+            # Pull out fragment information (flags and offset are packed into the off field, so use bitmasks)
+            do_not_fragment = bool(ip.off & dpkt.ip.IP_DF)
+            more_fragments = bool(ip.off & dpkt.ip.IP_MF)
+            fragment_offset = ip.off & dpkt.ip.IP_OFFMASK
+
+            if verbose:
+                # Print out the complete IP information
+                print "IP: %s -> %s (len=%d ttl=%d DF=%d MF=%d offset=%d)\n" % \
+                    (inet_to_str(ip.src), inet_to_str(ip.dst), ip.len, ip.ttl, do_not_fragment,
+                     more_fragments, fragment_offset)
+
+            # Count TCP packets
+            if ip.p == dpkt.ip.IP_PROTO_TCP:
+                tcp_counter += 1
+
+            # Count UDP packets
+            if ip.p == dpkt.ip.IP_PROTO_UDP:
+                udp_counter += 1
+
+        show_progress(verbose, counter)
+
+    # Print general statistics
+    show_summary(counter, ip_counter, tcp_counter, udp_counter)
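+
+
+# Worked example of the fragment-field bitmasks used above (constants from
+# dpkt: IP_DF = 0x4000, IP_MF = 0x2000, IP_OFFMASK = 0x1fff), for a
+# hypothetical packet whose off field is 0x4000:
+#
+#   do_not_fragment = bool(0x4000 & 0x4000)  -> True
+#   more_fragments  = bool(0x4000 & 0x2000)  -> False
+#   fragment_offset = 0x4000 & 0x1fff        -> 0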
+
+
+def point_to_many_analysis(filename_out, dev_add, verbose, pcap, counter, ip_counter,
+                           tcp_counter, udp_counter):
+    """ This analysis reports how one device (identified by its MAC or IP address)
+        communicates with every other device in the analyzed PCAP file.
+        Args:
+            filename_out: output file name for the generated report ("" to skip saving)
+            dev_add: device address (MAC or IP address)
+            verbose: verbose output (True/False)
+            pcap: object that handles PCAP file content
+            counter: counter of all packets
+            ip_counter: counter of all IP packets
+            tcp_counter: counter of all TCP packets
+            udp_counter: counter of all UDP packets
+    """
+    # Dictionaries that map each destination address to its frequency
+    mac2freq = dict()
+    ip2freq = dict()
+    for time_stamp, packet in pcap:
+
+        counter += 1
+        eth = dpkt.ethernet.Ethernet(packet)
+
+        # Save the timestamp and MAC addresses
+        tstamp = str(datetime.datetime.utcfromtimestamp(time_stamp))
+        mac_src = mac_addr(eth.src)
+        mac_dst = mac_addr(eth.dst)
+
+        # Process only IP data
+        if not isinstance(eth.data, dpkt.ip.IP):
+            is_ip = False
+            if verbose:
+                print "Non-IP packet type not analyzed... skipping..."
+                print
+        else:
+            is_ip = True
+
+        if is_ip:
+            ip = eth.data
+            ip_counter += 1
+
+            # Pull out fragment information (flags and offset are packed into the off field, so use bitmasks)
+            do_not_fragment = bool(ip.off & dpkt.ip.IP_DF)
+            more_fragments = bool(ip.off & dpkt.ip.IP_MF)
+            fragment_offset = ip.off & dpkt.ip.IP_OFFMASK
+
+            # Save IP addresses
+            ip_src = inet_to_str(ip.src)
+            ip_dst = inet_to_str(ip.dst)
+
+            if verbose:
+                # Print out the complete IP information
+                print "IP: %s -> %s (len=%d ttl=%d DF=%d MF=%d offset=%d)\n" % \
+                    (ip_src, ip_dst, ip.len, ip.ttl, do_not_fragment,
+                     more_fragments, fragment_offset)
+
+            # Categorize packets based on source device address
+            # Save the destination device addresses (point-to-many)
+            if dev_add == ip_src:
+                ip2freq[ip_dst] = ip2freq.get(ip_dst, 0) + 1
+
+            if dev_add == mac_src:
+                mac2freq[mac_dst] = mac2freq.get(mac_dst, 0) + 1
+
+            # Count TCP packets
+            if ip.p == dpkt.ip.IP_PROTO_TCP:
+                tcp_counter += 1
+
+            # Count UDP packets
+            if ip.p == dpkt.ip.IP_PROTO_UDP:
+                udp_counter += 1
+
+        show_progress(verbose, counter)
+
+    # Print general statistics
+    show_summary(counter, ip_counter, tcp_counter, udp_counter)
+    # Save results into a file if filename_out is not empty
+    if filename_out != "":
+        print "Saving results into file: ", filename_out
+        ip_tbl_header = "Point-to-many Analysis - IP destinations for " + dev_add
+        mac_tbl_header = "Point-to-many Analysis - MAC destinations for " + dev_add
+        save_to_file(ip_tbl_header, ip2freq, filename_out)
+        save_to_file(mac_tbl_header, mac2freq, filename_out)
+    else:
+        print "Output file name is not specified... exiting now!"
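+
+
+# Sketch of the report that point_to_many_analysis() appends to the output
+# file via save_to_file(); the device address, destination addresses, and
+# frequencies below are made-up placeholders:
+#
+#   Point-to-many Analysis - IP destinations for 192.168.1.10
+#   8.8.8.8, 42
+#   93.184.216.34, 7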
+
+
+def parse_cli_args(argv):
+    """ Parse command line arguments and store them in a dictionary
+        Args:
+            argv: list of command line arguments and their values
+        Returns:
+            dict: dictionary that maps arguments to their values
+    """
+    options = dict()
+    # First argument is "CAPture.py", so skip it
+    argv = argv[1:]
+    # Loop and collect arguments and their values
+    while argv:
+        print "Examining argument: ", argv[0]
+        # Check the first character of each argv element
+        # If it is a '-' then it is a command line argument
+        if argv[0][0] == '-':
+            if argv[0] == VERBOSE:
+                # The VERBOSE argument is a flag and takes no value
+                options[argv[0]] = True
+                # Remove the command line argument
+                argv = argv[1:]
+            else:
+                if len(argv) < 2:
+                    print "Missing value for argument: ", argv[0]
+                    break
+                options[argv[0]] = argv[1]
+                # Remove the command line argument and its value
+                argv = argv[2:]
+        else:
+            # Skip anything that does not look like an argument
+            # (otherwise the loop would never advance)
+            print "Ignoring unrecognized argument: ", argv[0]
+            argv = argv[1:]
+
+    return options
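+
+
+# Example of what parse_cli_args() above returns for a hypothetical
+# invocation "python CAPture.py -i trace.pcap -pm 192.168.1.10 -v":
+#
+#   parse_cli_args(['CAPture.py', '-i', 'trace.pcap', '-pm', '192.168.1.10', '-v'])
+#   -> {'-i': 'trace.pcap', '-pm': '192.168.1.10', '-v': True}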
+
+
+""" -----------------------------------------------------------------------------
+    Main Running Methods
+    -----------------------------------------------------------------------------
+"""
+def main():
+    # Counters
+    counter = 0
+    ip_counter = 0
+    tcp_counter = 0
+    udp_counter = 0
+    # Booleans as flags
+    verbose = False
+    is_statistical_analysis = True
+    is_point_to_many_analysis = False
+    # Names
+    filename_in = ""
+    filename_out = ""
+    dev_add = ""
+
+    # Welcome message
+    print
+    print "Welcome to CAPture version 1.0 - A PCAP file instant analyzer!"
+
+    # Get the file name from user input
+    # Show usage if the file name is not specified (only accept 1 file name for now)
+    if len(sys.argv) < 2:
+        show_usage()
+        print
+        return
+
+    # Check and process sys.argv
+    options = parse_cli_args(sys.argv)
+    for key, value in options.iteritems():
+        # Process "-i" - input PCAP file
+        if key == INPUT:
+            filename_in = value
+        elif key == OUTPUT:
+            filename_out = value
+        elif key == VERBOSE:
+            verbose = True
+        elif key == POINT_TO_MANY:
+            is_statistical_analysis = False
+            is_point_to_many_analysis = True
+            dev_add = value
+
+    # Show the manual again if the input is not correct
+    if filename_in == "":
+        print "File name is empty!"
+        print
+        show_usage()
+        print
+        return
+
+    # dev_add is needed for the point-to-many analysis
+    if is_point_to_many_analysis and dev_add == "":
+        print "Device address is empty!"
+        print
+        show_usage()
+        print
+        return
+
+    # One PCAP file name is specified - now analyze!
+    print "Analyzing PCAP file: ", filename_in
+
+    # Open and analyze the PCAP file
+    with open(filename_in, 'rb') as f:
+        pcap = dpkt.pcap.Reader(f)
+
+        # Choose from the existing options
+        if is_statistical_analysis:
+            statistical_analysis(verbose, pcap, counter, ip_counter, tcp_counter, udp_counter)
+        elif is_point_to_many_analysis:
+            point_to_many_analysis(filename_out, dev_add, verbose, pcap, counter, ip_counter,
+                                   tcp_counter, udp_counter)
+
+
+if __name__ == "__main__":
+    # Call the main function since this is being run as the start
+    main()
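+
+
+# Example invocations (illustrative; the file names and the device address
+# are placeholders):
+#
+#   python CAPture.py -i capture.pcap
+#       -> overall IP/TCP/UDP statistics for capture.pcap
+#   python CAPture.py -i capture.pcap -pm 192.168.1.10 -o report.txt -v
+#       -> point-to-many report for device 192.168.1.10, saved to report.txt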

diff --git a/extract_from_tshark.py b/extract_from_tshark.py
new file mode 100644
index 0000000..a66b556
--- /dev/null
+++ b/extract_from_tshark.py
@@ -0,0 +1,175 @@
+#!/usr/bin/python
+
+"""
+Script used to extract only the needed information from JSON packet traces generated by
+tshark from the PCAPNG format
+"""
+
+import os, sys
+import json
+import uuid
+
+from collections import OrderedDict
+
+json_key_source = "_source"
+json_key_layers = "layers"
+
+json_key_ip = "ip"
+json_key_tcp = "tcp"
+
+json_key_http = "http"
+json_key_method = "method"
+json_key_uri = "uri"
+json_key_headers = "headers"
+json_key_host = "host"
+
+json_key_http_req = json_key_http + ".request."
+json_key_http_req_method = json_key_http_req + json_key_method
+json_key_http_req_uri = json_key_http_req + json_key_uri
+json_key_http_req_line = json_key_http_req + "line"
+
+json_key_pkt_comment = "pkt_comment"
+
+json_key_frame = "frame"
+json_key_frame_num = json_key_frame + ".number"
+json_key_frame_comment = json_key_frame + ".comment"
+json_key_frame_ts = json_key_frame + ".time_epoch"
+
+
+def make_unique(key, dct):
+    # Append a counter suffix to key until it no longer collides with a key in dct
+    counter = 0
+    unique_key = key
+
+    while unique_key in dct:
+        counter += 1
+        unique_key = '{}_{}'.format(key, counter)
+    return unique_key
+
+
+def parse_object_pairs(pairs):
+    # object_pairs_hook for json.JSONDecoder that keeps duplicate keys by renaming them
+    dct = OrderedDict()
+    for key, value in pairs:
+        if key in dct:
+            key = make_unique(key, dct)
+        dct[key] = value
+
+    return dct
+
+
+def change_file(fpath):
+    for fn in os.listdir(fpath):
+        full_path = os.path.join(fpath, fn)
+
+        # Recursively go through all directories
+        if os.path.isdir(full_path):
+            change_file(full_path)
+            continue
+
+        print full_path
+        with open(full_path, "r+") as jf:
+            # Since certain JSON 'keys' appear multiple times in our data, we have to make them
+            # unique first (we can't use regular json.load() or we lose some data points). From:
+            # https://stackoverflow.com/questions/29321677/python-json-parser-allow-duplicate-keys
+            decoder = json.JSONDecoder(object_pairs_hook=parse_object_pairs)
+            pcap_data = decoder.decode(jf.read())
+
+            # Prepare a new data structure for re-formatted JSON storage
+            data = {}
+            for packet in pcap_data:
+                layers = packet[json_key_source][json_key_layers]
+
+                # All captured traffic should have a frame + frame number, but check anyway
+                frame_num = " Frame: "
+                if json_key_frame not in layers or json_key_frame_num not in layers[json_key_frame]:
+                    print "WARNING: could not find frame number! Using -1..."
+                    frame_num = frame_num + "-1"
+                else:
+                    # Save the frame number for error reporting
+                    frame_num = frame_num + layers[json_key_frame][json_key_frame_num]
+
+                # All captured traffic should be IP, but check anyway
+                if json_key_ip not in layers:
+                    print "WARNING: Non-IP traffic detected!" + frame_num
+                    continue
+
+                # For now, focus on HTTP only
+                if json_key_tcp not in layers or json_key_http not in layers:
+                    continue
+
+                # Fill our new JSON packet with TCP/IP info
+                new_packet = {}
+                new_packet["dst_ip"] = layers[json_key_ip][json_key_ip + ".dst"]
+                new_packet["dst_port"] = int(layers[json_key_tcp][json_key_tcp + ".dstport"])
+
+                # Go through all HTTP fields and extract the ones that are needed
+                http_data = layers[json_key_http]
+                for http_key in http_data:
+                    http_value = http_data[http_key]
+
+                    if http_key.startswith(json_key_http_req_line):
+                        # A request line such as "Host: example.com\r\n" is split into
+                        # a key ("Host") and a value ("example.com")
+                        header_line = http_value.split(":", 1)
+                        if len(header_line) != 2:
+                            print ("WARNING: could not parse header '" + str(header_line) + "'" +
+                                   frame_num)
+                            continue
+
+                        # Prepare a container for HTTP headers
+                        if json_key_headers not in new_packet:
+                            new_packet[json_key_headers] = {}
+
+                        # Use lower case for header keys to stay consistent with our other data
+                        header_key = header_line[0].lower()
+
+                        # Remove the trailing carriage return
+                        header_val = header_line[1].strip()
+
+                        # Save the header key-value pair
+                        new_packet[json_key_headers][header_key] = header_val
+
+                        # If this is the host header, we also save it to the main object
+                        if header_key == json_key_host:
+                            new_packet[json_key_host] = header_val
+
+                    # tshark may expand the request line into a nested object that
+                    # carries the method and URI; pick those up here
+                    if json_key_http_req_method in http_value:
+                        new_packet[json_key_method] = http_value[json_key_http_req_method]
+                    if json_key_http_req_uri in http_value:
+                        new_packet[json_key_uri] = http_value[json_key_http_req_uri]
+
+                # End of HTTP parsing
+
+                # Check that we found the minimum needed HTTP headers
+                if (json_key_uri not in new_packet or json_key_method not in new_packet or
+                        json_key_host not in new_packet):
+                    print "Missing some HTTP headers!" + frame_num
+                    continue
+
+                # Extract the timestamp
+                if json_key_frame_ts not in layers[json_key_frame]:
+                    print "WARNING: could not find timestamp!" + frame_num
+                    continue
+
+                new_packet["ts"] = layers[json_key_frame][json_key_frame_ts]
+
+                # Now extract and parse the packet comment
+                if (json_key_pkt_comment not in layers or
+                        json_key_frame_comment not in layers[json_key_pkt_comment]):
+                    print "WARNING: no packet comment found!" + frame_num
+                    continue
+
+                comment = layers[json_key_pkt_comment][json_key_frame_comment]
+                comment_data = json.loads(comment)
+                for key in comment_data:
+                    new_packet[str(key)] = str(comment_data[key])
+
+                # Create a unique key for each packet to stay consistent with ReCon
+                # Also good in case packets end up in different files
+                data[str(uuid.uuid4())] = new_packet
+
+            # Write the new data
+            #print json.dumps(data, sort_keys=True, indent=4)
+            jf.seek(0)
+            jf.write(json.dumps(data, sort_keys=True, indent=4))
+            jf.truncate()
+
+
+if __name__ == '__main__':
+    # Process all JSON trace files under the directory given as the first argument
+    change_file(sys.argv[1])
\ No newline at end of file
-- 
2.34.1