X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=origin%2Fbase_gefx_generator.py;fp=origin%2Fbase_gefx_generator.py;h=703fe456bbcdc70e5b06546c9f946fa3316eca9b;hb=60a643ee2dc29700849e61d9788cbbb9af419955;hp=0000000000000000000000000000000000000000;hpb=d4cc1378fd51bb35671f962a182bddf978201dca;p=pingpong.git diff --git a/origin/base_gefx_generator.py b/origin/base_gefx_generator.py new file mode 100644 index 0000000..703fe45 --- /dev/null +++ b/origin/base_gefx_generator.py @@ -0,0 +1,126 @@ +#!/usr/bin/python + +""" +Script that constructs a graph in which hosts are nodes. +An edge between two hosts indicate that the hosts communicate. +Hosts are labeled and identified by their IPs. +The graph is written to a file in Graph Exchange XML format for later import and visual inspection in Gephi. + +The input to this script is the JSON output by extract_from_tshark.py by Anastasia Shuba. + +This script is a simplification of Milad Asgari's parser_data_to_gephi.py script. +It serves as a baseline for future scripts that want to include more information in the graph. +""" + +import socket +import json +import tldextract +import networkx as nx +import sys +from decimal import * + +import parse_dns + +JSON_KEY_ETH_SRC = "eth.src" +JSON_KEY_ETH_DST = "eth.dst" + +def parse_json(file_path): + + device_dns_mappings = parse_dns.parse_json_dns("./dns.json") + + # Init empty graph + G = nx.DiGraph() + with open(file_path) as jf: + # Read JSON. + # data becomes reference to root JSON object (or in our case json array) + data = json.load(jf) + # Loop through json objects in data + for k in data: + # Fetch timestamp of packet + packet_timestamp = Decimal(data[k]["ts"]) + # Fetch eth source and destination info + eth_src = data[k][JSON_KEY_ETH_SRC] + eth_dst = data[k][JSON_KEY_ETH_DST] + # Traffic can be both outbound and inbound. + # Determine which one of the two by looking up device MAC in DNS map. + iot_device = None + if eth_src in device_dns_mappings: + iot_device = eth_src + elif eth_dst in device_dns_mappings: + iot_device = eth_dst + else: + print "[ WARNING: DNS mapping not found for device with MAC", eth_src, "OR", eth_dst, "]" + # This must be local communication between two IoT devices OR an IoT device talking to a hardcoded IP. + # For now let's assume local communication. + # Add a node for each device and an edge between them. + G.add_node(eth_src) + G.add_node(eth_dst) + G.add_edge(eth_src, eth_dst) + # TODO add regex check on src+dst IP to figure out if hardcoded server IP (e.g. check if one of the two are NOT a 192.168.x.y IP) + continue + # It is outbound traffic if iot_device matches src, otherwise it must be inbound traffic. + outbound_traffic = iot_device == eth_src + + ''' Graph construction ''' + # No need to check if the Nodes and/or Edges we add already exist: + # NetworkX won't add already existing nodes/edges (except in the case of a MultiGraph or MultiDiGraph (see NetworkX doc)). + + # Add a node for each host. + # First add node for IoT device. + G.add_node(iot_device) + # Then add node for the server. + # For this we need to distinguish between outbound and inbound traffic so that we look up the proper IP in our DNS map. + # For outbound traffic, the server's IP is the destination IP. + # For inbound traffic, the server's IP is the source IP. + server_ip = data[k]["dst_ip"] if outbound_traffic else data[k]["src_ip"] + hostname = device_dns_mappings[iot_device].hostname_for_ip_at_time(server_ip, packet_timestamp) + if hostname is None: + # TODO this can occur when two local devices communicate OR if IoT device has hardcoded server IP. + # However, we only get here for the DNS that have not performed any DNS lookups + # We should use a regex check early in the loop to see if it is two local devices communicating. + # This way we would not have to consider these corner cases later on. + print "[ WARNING: no ip-hostname mapping found for ip", server_ip, " -- adding eth.src->eth.dst edge, but note that this may be incorrect if IoT device has hardcoded server IP ]" + G.add_node(eth_src) + G.add_node(eth_dst) + G.add_edge(eth_src, eth_dst) + continue + G.add_node(hostname) + # Connect the two nodes we just added. + if outbound_traffic: + G.add_edge(iot_device, hostname) + else: + G.add_edge(hostname, iot_device) + return G + +# ------------------------------------------------------ +# Not currently used. +# Might be useful later on if we wish to resolve IPs. +def get_domain(host): + ext_result = tldextract.extract(str(host)) + # Be consistent with ReCon and keep suffix + domain = ext_result.domain + "." + ext_result.suffix + return domain + +def is_IP(addr): + try: + socket.inet_aton(addr) + return True + except socket.error: + return False +# ------------------------------------------------------ + +if __name__ == '__main__': + if len(sys.argv) < 3: + print "Usage:", sys.argv[0], "input_file output_file" + print "outfile_file should end in .gexf" + sys.exit(0) + # Input file: Path to JSON file generated from tshark JSON output using Anastasia's script (extract_from_tshark.py). + input_file = sys.argv[1] + print "[ input_file =", input_file, "]" + # Output file: Path to file where the Gephi XML should be written. + output_file = sys.argv[2] + print "[ output_file =", output_file, "]" + # Construct graph from JSON + G = parse_json(input_file) + # Write Graph in Graph Exchange XML format + nx.write_gexf(G, output_file)